diff mbox series

[PATCHv4,iproute2,1/2] lib/libnetlink: re malloc buff if size is not enough

Message ID 1506605626-1744-2-git-send-email-haliu@redhat.com
State Changes Requested, archived
Delegated to: stephen hemminger
Headers show
Series libnetlink: malloc correct buff at run time | expand

Commit Message

Hangbin Liu Sept. 28, 2017, 1:33 p.m. UTC
From: Hangbin Liu <liuhangbin@gmail.com>

With commit 72b365e8e0fd ("libnetlink: Double the dump buffer size")
we doubled the buffer size to support more VFs. But the number of VFs keeps
increasing; some customers now use more than 200 VFs.

We cannot keep doubling it every time the buffer turns out to be too small.
Instead, stop hard-coding the buffer size and malloc the correct size at
run time.

Introduce the function rtnl_recvmsg(), which always returns a newly allocated
buffer. The caller needs to free it after use.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
 lib/libnetlink.c | 114 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 80 insertions(+), 34 deletions(-)

Comments

Michal Kubecek Sept. 29, 2017, 12:55 p.m. UTC | #1
On Thu, Sep 28, 2017 at 09:33:45PM +0800, Hangbin Liu wrote:
> From: Hangbin Liu <liuhangbin@gmail.com>
> 
> With commit 72b365e8e0fd ("libnetlink: Double the dump buffer size")
> we doubled the buffer size to support more VFs. But the VFs number is
> increasing all the time. Some customers even use more than 200 VFs now.
> 
> We could not double it everytime when the buffer is not enough. Let's just
> not hard code the buffer size and malloc the correct number when running.
> 
> Introduce function rtnl_recvmsg() to always return a newly allocated buffer.
> The caller need to free it after using.
> 
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> Signed-off-by: Phil Sutter <phil@nwl.cc>
> ---
>  lib/libnetlink.c | 114 ++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 80 insertions(+), 34 deletions(-)
> 

Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
Stephen Hemminger Sept. 29, 2017, 5:54 p.m. UTC | #2
On Thu, 28 Sep 2017 21:33:45 +0800
Hangbin Liu <haliu@redhat.com> wrote:

>  
> +static int __rtnl_recvmsg(int fd, struct msghdr *msg, int flags)
> +{
> +	int len;
> +
> +	do {
> +		len = recvmsg(fd, msg, flags);
> +	} while (len < 0 && (errno == EINTR || errno == EAGAIN));
> +
> +	if (len < 0) {
> +		fprintf(stderr, "netlink receive error %s (%d)\n",
> +			strerror(errno), errno);
> +		return -errno;
> +	}
> +
> +	if (len == 0) {
> +		fprintf(stderr, "EOF on netlink\n");
> +		return -ENODATA;
> +	}
> +
> +	return len;
> +}
> +
> +static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer)
> +{
> +	struct iovec *iov = msg->msg_iov;
> +	char *buf;
> +	int len;
> +
> +	iov->iov_base = NULL;
> +	iov->iov_len = 0;
> +
> +	len = __rtnl_recvmsg(fd, msg, MSG_PEEK | MSG_TRUNC);
> +	if (len < 0)
> +		return len;
> +
> +	buf = malloc(len);
> +	if (!buf) {
> +		fprintf(stderr, "malloc error: not enough buffer\n");
> +		return -ENOMEM;
> +	}
> +
> +	iov->iov_base = buf;
> +	iov->iov_len = len;
> +
> +	len = __rtnl_recvmsg(fd, msg, 0);
> +	if (len < 0) {
> +		free(buf);
> +		return len;
> +	}
> +
> +	if (answer)
> +		*answer = buf;
> +	else
> +		free(buf);
> +
> +	return len;
> +}

Doubling the number of system calls per message is not going to make
users with 5,000,000 routes or 1000 vlans, or 10,000 tunnels happy.
Please rethink this.
Michal Kubecek Sept. 29, 2017, 6:20 p.m. UTC | #3
On Fri, Sep 29, 2017 at 10:54:40AM -0700, Stephen Hemminger wrote:
> On Thu, 28 Sep 2017 21:33:45 +0800
> Hangbin Liu <haliu@redhat.com> wrote:
> 
> >  
> > +static int __rtnl_recvmsg(int fd, struct msghdr *msg, int flags)
> > +{
> > +	int len;
> > +
> > +	do {
> > +		len = recvmsg(fd, msg, flags);
> > +	} while (len < 0 && (errno == EINTR || errno == EAGAIN));
> > +
> > +	if (len < 0) {
> > +		fprintf(stderr, "netlink receive error %s (%d)\n",
> > +			strerror(errno), errno);
> > +		return -errno;
> > +	}
> > +
> > +	if (len == 0) {
> > +		fprintf(stderr, "EOF on netlink\n");
> > +		return -ENODATA;
> > +	}
> > +
> > +	return len;
> > +}
> > +
> > +static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer)
> > +{
> > +	struct iovec *iov = msg->msg_iov;
> > +	char *buf;
> > +	int len;
> > +
> > +	iov->iov_base = NULL;
> > +	iov->iov_len = 0;
> > +
> > +	len = __rtnl_recvmsg(fd, msg, MSG_PEEK | MSG_TRUNC);
> > +	if (len < 0)
> > +		return len;
> > +
> > +	buf = malloc(len);
> > +	if (!buf) {
> > +		fprintf(stderr, "malloc error: not enough buffer\n");
> > +		return -ENOMEM;
> > +	}
> > +
> > +	iov->iov_base = buf;
> > +	iov->iov_len = len;
> > +
> > +	len = __rtnl_recvmsg(fd, msg, 0);
> > +	if (len < 0) {
> > +		free(buf);
> > +		return len;
> > +	}
> > +
> > +	if (answer)
> > +		*answer = buf;
> > +	else
> > +		free(buf);
> > +
> > +	return len;
> > +}
> 
> Doubling the number of system calls per message is not going to make
> users with 5,000,000 routes or 1000 vlans, or 10,000 tunnels happy.
> Please rethink this.

I'm not sure it's possible to avoid this if we want to be able to get
rid of a preset message length limit. If you call recvmsg() without
MSG_PEEK and your buffer isn't sufficiently large, the message is lost.
And once you use MSG_PEEK, you need another syscall to remove the
message from the queue even if you read all data. In other words, to be
sure you don't lose the reply, you have to do two syscalls.

One alternative I can see would be calling recvmsg() without MSG_PEEK
(but with a reasonably large buffer) and repeating the request if the
buffer is not large enough (and the caller is actually interested in the
answer). But I don't think this is desirable either, as that would result
in even worse overhead.

Michal Kubecek
Hangbin Liu Sept. 30, 2017, 1:54 p.m. UTC | #4
Hi Stephen,
On Fri, Sep 29, 2017 at 10:54:40AM -0700, Stephen Hemminger wrote:
> 
> Doubling the number of system calls per message is not going to make
> users with 5,000,000 routes or 1000 vlans, or 10,000 tunnels happy.
> Please rethink this.

I tried adding 2500 vlans and 70,000 routes and then showing the results.
The times look reasonable.

# ip link show | wc -l
5024

# time ip link show > /dev/null

real    0m0.218s
user    0m0.007s
sys     0m0.210s

# time iproute2/ip/ip link show > /dev/null

real    0m0.221s
user    0m0.008s
sys     0m0.212s

# time ip addr show > /dev/null

real    0m0.299s
user    0m0.094s
sys     0m0.205s

# time iproute2/ip/ip addr show > /dev/null

real    0m0.302s
user    0m0.099s
sys     0m0.202s

# ip -6 route show | wc -l
704458

# time ip -6 route show > /dev/null

real    0m5.400s
user    0m0.947s
sys     0m4.453s

# time iproute2/ip/ip -6 route show > /dev/null

real    0m5.404s
user    0m1.070s
sys     0m4.333s


Thanks
Hangbin
diff mbox series

Patch

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index be7ac86..1847c0b 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -402,6 +402,64 @@  static void rtnl_dump_error(const struct rtnl_handle *rth,
 	}
 }
 
+static int __rtnl_recvmsg(int fd, struct msghdr *msg, int flags)
+{
+	int len;
+
+	do {
+		len = recvmsg(fd, msg, flags);
+	} while (len < 0 && (errno == EINTR || errno == EAGAIN));
+
+	if (len < 0) {
+		fprintf(stderr, "netlink receive error %s (%d)\n",
+			strerror(errno), errno);
+		return -errno;
+	}
+
+	if (len == 0) {
+		fprintf(stderr, "EOF on netlink\n");
+		return -ENODATA;
+	}
+
+	return len;
+}
+
+static int rtnl_recvmsg(int fd, struct msghdr *msg, char **answer)
+{
+	struct iovec *iov = msg->msg_iov;
+	char *buf;
+	int len;
+
+	iov->iov_base = NULL;
+	iov->iov_len = 0;
+
+	len = __rtnl_recvmsg(fd, msg, MSG_PEEK | MSG_TRUNC);
+	if (len < 0)
+		return len;
+
+	buf = malloc(len);
+	if (!buf) {
+		fprintf(stderr, "malloc error: not enough buffer\n");
+		return -ENOMEM;
+	}
+
+	iov->iov_base = buf;
+	iov->iov_len = len;
+
+	len = __rtnl_recvmsg(fd, msg, 0);
+	if (len < 0) {
+		free(buf);
+		return len;
+	}
+
+	if (answer)
+		*answer = buf;
+	else
+		free(buf);
+
+	return len;
+}
+
 int rtnl_dump_filter_l(struct rtnl_handle *rth,
 		       const struct rtnl_dump_filter_arg *arg)
 {
@@ -413,31 +471,18 @@  int rtnl_dump_filter_l(struct rtnl_handle *rth,
 		.msg_iov = &iov,
 		.msg_iovlen = 1,
 	};
-	char buf[32768];
+	char *buf;
 	int dump_intr = 0;
 
-	iov.iov_base = buf;
 	while (1) {
 		int status;
 		const struct rtnl_dump_filter_arg *a;
 		int found_done = 0;
 		int msglen = 0;
 
-		iov.iov_len = sizeof(buf);
-		status = recvmsg(rth->fd, &msg, 0);
-
-		if (status < 0) {
-			if (errno == EINTR || errno == EAGAIN)
-				continue;
-			fprintf(stderr, "netlink receive error %s (%d)\n",
-				strerror(errno), errno);
-			return -1;
-		}
-
-		if (status == 0) {
-			fprintf(stderr, "EOF on netlink\n");
-			return -1;
-		}
+		status = rtnl_recvmsg(rth->fd, &msg, &buf);
+		if (status < 0)
+			return status;
 
 		if (rth->dump_fp)
 			fwrite(buf, 1, NLMSG_ALIGN(status), rth->dump_fp);
@@ -462,8 +507,10 @@  int rtnl_dump_filter_l(struct rtnl_handle *rth,
 
 				if (h->nlmsg_type == NLMSG_DONE) {
 					err = rtnl_dump_done(h);
-					if (err < 0)
+					if (err < 0) {
+						free(buf);
 						return -1;
+					}
 
 					found_done = 1;
 					break; /* process next filter */
@@ -471,19 +518,23 @@  int rtnl_dump_filter_l(struct rtnl_handle *rth,
 
 				if (h->nlmsg_type == NLMSG_ERROR) {
 					rtnl_dump_error(rth, h);
+					free(buf);
 					return -1;
 				}
 
 				if (!rth->dump_fp) {
 					err = a->filter(&nladdr, h, a->arg1);
-					if (err < 0)
+					if (err < 0) {
+						free(buf);
 						return err;
+					}
 				}
 
 skip_it:
 				h = NLMSG_NEXT(h, msglen);
 			}
 		}
+		free(buf);
 
 		if (found_done) {
 			if (dump_intr)
@@ -543,7 +594,7 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 		.msg_iov = &iov,
 		.msg_iovlen = 1,
 	};
-	char   buf[32768] = {};
+	char *buf;
 
 	n->nlmsg_seq = seq = ++rtnl->seq;
 
@@ -556,22 +607,12 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 		return -1;
 	}
 
-	iov.iov_base = buf;
 	while (1) {
-		iov.iov_len = sizeof(buf);
-		status = recvmsg(rtnl->fd, &msg, 0);
+		status = rtnl_recvmsg(rtnl->fd, &msg, &buf);
+
+		if (status < 0)
+			return status;
 
-		if (status < 0) {
-			if (errno == EINTR || errno == EAGAIN)
-				continue;
-			fprintf(stderr, "netlink receive error %s (%d)\n",
-				strerror(errno), errno);
-			return -1;
-		}
-		if (status == 0) {
-			fprintf(stderr, "EOF on netlink\n");
-			return -1;
-		}
 		if (msg.msg_namelen != sizeof(nladdr)) {
 			fprintf(stderr,
 				"sender address length == %d\n",
@@ -585,6 +626,7 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 			if (l < 0 || len > status) {
 				if (msg.msg_flags & MSG_TRUNC) {
 					fprintf(stderr, "Truncated message\n");
+					free(buf);
 					return -1;
 				}
 				fprintf(stderr,
@@ -611,6 +653,7 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 					if (answer)
 						memcpy(answer, h,
 						       MIN(maxlen, h->nlmsg_len));
+					free(buf);
 					return 0;
 				}
 
@@ -619,12 +662,14 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 					rtnl_talk_error(h, err, errfn);
 
 				errno = -err->error;
+				free(buf);
 				return -1;
 			}
 
 			if (answer) {
 				memcpy(answer, h,
 				       MIN(maxlen, h->nlmsg_len));
+				free(buf);
 				return 0;
 			}
 
@@ -633,6 +678,7 @@  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
 			status -= NLMSG_ALIGN(len);
 			h = (struct nlmsghdr *)((char *)h + NLMSG_ALIGN(len));
 		}
+		free(buf);
 
 		if (msg.msg_flags & MSG_TRUNC) {
 			fprintf(stderr, "Message truncated\n");