diff mbox series

[v2] osdep: add a qemu_close_all_open_fd() helper

Message ID 20240618111704.63092-1-cleger@rivosinc.com
State New
Headers show
Series [v2] osdep: add a qemu_close_all_open_fd() helper | expand

Commit Message

Clément Léger June 18, 2024, 11:17 a.m. UTC
Since commit 03e471c41d8b ("qemu_init: increase NOFILE soft limit on
POSIX"), the maximum number of file descriptors that can be opened are
raised to nofile.rlim_max. On recent debian distro, this yield a maximum
of 1073741816 file descriptors. Now, when forking to start
qemu-bridge-helper, this actually calls close() on the full possible file
descriptor range (more precisely [3 - sysconf(_SC_OPEN_MAX)]) which
takes a considerable amount of time. In order to reduce that time,
factorize existing code to close all open files descriptors in a new
qemu_close_all_open_fd() function. This function uses various methods
to close all the open file descriptors ranging from the most efficient
one to the least one. It also accepts an ordered array of file
descriptors that should not be closed since this is required by the
callers that calls it after forking.

Signed-off-by: Clément Léger <cleger@rivosinc.com>

----

v2:
 - Factorize async_teardown.c close_fds implementation as well as tap.c ones
 - Apply checkpatch
 - v1: https://lore.kernel.org/qemu-devel/20240617162520.4045016-1-cleger@rivosinc.com/

---
 include/qemu/osdep.h    |   8 +++
 net/tap.c               |  31 ++++++-----
 system/async-teardown.c |  37 +------------
 util/osdep.c            | 115 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 141 insertions(+), 50 deletions(-)

Comments

Clément Léger July 11, 2024, 1:34 p.m. UTC | #1
Gentle ping ?

Thanks,

Clément

On 18/06/2024 13:17, Clément Léger wrote:
> Since commit 03e471c41d8b ("qemu_init: increase NOFILE soft limit on
> POSIX"), the maximum number of file descriptors that can be opened are
> raised to nofile.rlim_max. On recent debian distro, this yield a maximum
> of 1073741816 file descriptors. Now, when forking to start
> qemu-bridge-helper, this actually calls close() on the full possible file
> descriptor range (more precisely [3 - sysconf(_SC_OPEN_MAX)]) which
> takes a considerable amount of time. In order to reduce that time,
> factorize existing code to close all open files descriptors in a new
> qemu_close_all_open_fd() function. This function uses various methods
> to close all the open file descriptors ranging from the most efficient
> one to the least one. It also accepts an ordered array of file
> descriptors that should not be closed since this is required by the
> callers that calls it after forking.
> 
> Signed-off-by: Clément Léger <cleger@rivosinc.com>
> 
> ----
> 
> v2:
>  - Factorize async_teardown.c close_fds implementation as well as tap.c ones
>  - Apply checkpatch
>  - v1: https://lore.kernel.org/qemu-devel/20240617162520.4045016-1-cleger@rivosinc.com/
> 
> ---
>  include/qemu/osdep.h    |   8 +++
>  net/tap.c               |  31 ++++++-----
>  system/async-teardown.c |  37 +------------
>  util/osdep.c            | 115 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 141 insertions(+), 50 deletions(-)
> 
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index f61edcfdc2..9369a97d3d 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -755,6 +755,14 @@ static inline void qemu_reset_optind(void)
>  
>  int qemu_fdatasync(int fd);
>  
> +/**
> + * Close all open file descriptors except the ones supplied in the @skip array
> + *
> + * @skip: ordered array of distinct file descriptors that should not be closed
> + * @nskip: number of entries in the @skip array.
> + */
> +void qemu_close_all_open_fd(const int *skip, unsigned int nskip);
> +
>  /**
>   * Sync changes made to the memory mapped file back to the backing
>   * storage. For POSIX compliant systems this will fallback
> diff --git a/net/tap.c b/net/tap.c
> index 51f7aec39d..6fc3939078 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -385,6 +385,21 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
>      return s;
>  }
>  
> +static void close_all_fds_after_fork(int excluded_fd)
> +{
> +        const int skip_fd[] = {0, 1, 2, 3, excluded_fd};
> +        unsigned int nskip = ARRAY_SIZE(skip_fd);
> +
> +        /*
> +         * skip_fd must be an ordered array of distinct fds, exclude
> +         * excluded_fd if already included in the [0 - 3] range
> +         */
> +        if (excluded_fd <= 3) {
> +            nskip--;
> +        }
> +        qemu_close_all_open_fd(skip_fd, nskip);
> +}
> +
>  static void launch_script(const char *setup_script, const char *ifname,
>                            int fd, Error **errp)
>  {
> @@ -400,13 +415,7 @@ static void launch_script(const char *setup_script, const char *ifname,
>          return;
>      }
>      if (pid == 0) {
> -        int open_max = sysconf(_SC_OPEN_MAX), i;
> -
> -        for (i = 3; i < open_max; i++) {
> -            if (i != fd) {
> -                close(i);
> -            }
> -        }
> +        close_all_fds_after_fork(fd);
>          parg = args;
>          *parg++ = (char *)setup_script;
>          *parg++ = (char *)ifname;
> @@ -490,17 +499,11 @@ static int net_bridge_run_helper(const char *helper, const char *bridge,
>          return -1;
>      }
>      if (pid == 0) {
> -        int open_max = sysconf(_SC_OPEN_MAX), i;
>          char *fd_buf = NULL;
>          char *br_buf = NULL;
>          char *helper_cmd = NULL;
>  
> -        for (i = 3; i < open_max; i++) {
> -            if (i != sv[1]) {
> -                close(i);
> -            }
> -        }
> -
> +        close_all_fds_after_fork(sv[1]);
>          fd_buf = g_strdup_printf("%s%d", "--fd=", sv[1]);
>  
>          if (strrchr(helper, ' ') || strrchr(helper, '\t')) {
> diff --git a/system/async-teardown.c b/system/async-teardown.c
> index 396963c091..9148ee8d04 100644
> --- a/system/async-teardown.c
> +++ b/system/async-teardown.c
> @@ -26,40 +26,6 @@
>  
>  static pid_t the_ppid;
>  
> -/*
> - * Close all open file descriptors.
> - */
> -static void close_all_open_fd(void)
> -{
> -    struct dirent *de;
> -    int fd, dfd;
> -    DIR *dir;
> -
> -#ifdef CONFIG_CLOSE_RANGE
> -    int r = close_range(0, ~0U, 0);
> -    if (!r) {
> -        /* Success, no need to try other ways. */
> -        return;
> -    }
> -#endif
> -
> -    dir = opendir("/proc/self/fd");
> -    if (!dir) {
> -        /* If /proc is not mounted, there is nothing that can be done. */
> -        return;
> -    }
> -    /* Avoid closing the directory. */
> -    dfd = dirfd(dir);
> -
> -    for (de = readdir(dir); de; de = readdir(dir)) {
> -        fd = atoi(de->d_name);
> -        if (fd != dfd) {
> -            close(fd);
> -        }
> -    }
> -    closedir(dir);
> -}
> -
>  static void hup_handler(int signal)
>  {
>      /* Check every second if this process has been reparented. */
> @@ -85,9 +51,8 @@ static int async_teardown_fn(void *arg)
>      /*
>       * Close all file descriptors that might have been inherited from the
>       * main qemu process when doing clone, needed to make libvirt happy.
> -     * Not using close_range for increased compatibility with older kernels.
>       */
> -    close_all_open_fd();
> +    qemu_close_all_open_fd(NULL, 0);
>  
>      /* Set up a handler for SIGHUP and unblock SIGHUP. */
>      sigaction(SIGHUP, &sa, NULL);
> diff --git a/util/osdep.c b/util/osdep.c
> index 5d23bbfbec..f3710710e3 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -625,3 +625,118 @@ int qemu_fdatasync(int fd)
>      return fsync(fd);
>  #endif
>  }
> +
> +#ifdef CONFIG_LINUX
> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
> +{
> +    struct dirent *de;
> +    int fd, dfd;
> +    bool close_fd;
> +    DIR *dir;
> +    int i;
> +
> +    dir = opendir("/proc/self/fd");
> +    if (!dir) {
> +        /* If /proc is not mounted, there is nothing that can be done. */
> +        return false;
> +    }
> +    /* Avoid closing the directory. */
> +    dfd = dirfd(dir);
> +
> +    for (de = readdir(dir); de; de = readdir(dir)) {
> +        fd = atoi(de->d_name);
> +        close_fd = true;
> +        if (fd == dfd) {
> +            close_fd = false;
> +        } else {
> +            for (i = 0; i < nskip; i++) {
> +                if (fd == skip[i]) {
> +                    close_fd = false;
> +                    break;
> +                }
> +            }
> +        }
> +        if (close_fd) {
> +            close(fd);
> +        }
> +    }
> +    closedir(dir);
> +
> +    return true;
> +}
> +#else
> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
> +{
> +    return false;
> +}
> +#endif
> +
> +#ifdef CONFIG_CLOSE_RANGE
> +static bool qemu_close_all_open_fd_close_range(const int *skip,
> +                                               unsigned int nskip)
> +{
> +    int max_fd = sysconf(_SC_OPEN_MAX) - 1;
> +    int first = 0, last = max_fd;
> +    int cur_skip = 0, ret;
> +
> +    do {
> +        if (nskip) {
> +            while (first == skip[cur_skip]) {
> +                cur_skip++;
> +                first++;
> +            }
> +            if (cur_skip < nskip) {
> +                last = skip[cur_skip] - 1;
> +            }
> +            if (last > max_fd) {
> +                last = max_fd;
> +                /*
> +                 * We can directly skip the remaining skip fds since the current
> +                 * one is already above the maximum supported one.
> +                 */
> +                cur_skip = nskip;
> +            }
> +            if (first > last) {
> +                break;
> +            }
> +        }
> +        ret = close_range(first, last, 0);
> +        if (ret < 0) {
> +            return false;
> +        }
> +        first = last + 1;
> +        last = max_fd;
> +    } while (cur_skip < nskip);
> +
> +    return true;
> +}
> +#else
> +static bool qemu_close_all_open_fd_close_range(const int *skip,
> +                                               unsigned int nskip)
> +{
> +    return false;
> +}
> +#endif
> +
> +void qemu_close_all_open_fd(const int *skip, unsigned int nskip)
> +{
> +    int open_max = sysconf(_SC_OPEN_MAX);
> +    int cur_skip = 0, i;
> +
> +    if (qemu_close_all_open_fd_close_range(skip, nskip)) {
> +        return;
> +    }
> +
> +    if (qemu_close_all_open_fd_proc(skip, nskip)) {
> +        return;
> +    }
> +
> +    /* Fallback */
> +    for (i = 0; i < open_max; i++) {
> +        if (i == skip[cur_skip]) {
> +            cur_skip++;
> +            continue;
> +        }
> +        close(i);
> +    }
> +}
Richard Henderson July 11, 2024, 6:43 p.m. UTC | #2
On 6/18/24 04:17, Clément Léger wrote:
> Since commit 03e471c41d8b ("qemu_init: increase NOFILE soft limit on
> POSIX"), the maximum number of file descriptors that can be opened are
> raised to nofile.rlim_max. On recent debian distro, this yield a maximum
> of 1073741816 file descriptors. Now, when forking to start
> qemu-bridge-helper, this actually calls close() on the full possible file
> descriptor range (more precisely [3 - sysconf(_SC_OPEN_MAX)]) which
> takes a considerable amount of time. In order to reduce that time,
> factorize existing code to close all open files descriptors in a new
> qemu_close_all_open_fd() function. This function uses various methods
> to close all the open file descriptors ranging from the most efficient
> one to the least one. It also accepts an ordered array of file
> descriptors that should not be closed since this is required by the
> callers that calls it after forking.
> 
> Signed-off-by: Clément Léger <cleger@rivosinc.com>
> 
> ----
> 
> v2:
>   - Factorize async_teardown.c close_fds implementation as well as tap.c ones
>   - Apply checkpatch
>   - v1: https://lore.kernel.org/qemu-devel/20240617162520.4045016-1-cleger@rivosinc.com/
> 
> ---
>   include/qemu/osdep.h    |   8 +++
>   net/tap.c               |  31 ++++++-----
>   system/async-teardown.c |  37 +------------
>   util/osdep.c            | 115 ++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 141 insertions(+), 50 deletions(-)
> 
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index f61edcfdc2..9369a97d3d 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -755,6 +755,14 @@ static inline void qemu_reset_optind(void)
>   
>   int qemu_fdatasync(int fd);
>   
> +/**
> + * Close all open file descriptors except the ones supplied in the @skip array
> + *
> + * @skip: ordered array of distinct file descriptors that should not be closed
> + * @nskip: number of entries in the @skip array.
> + */
> +void qemu_close_all_open_fd(const int *skip, unsigned int nskip);
> +
>   /**
>    * Sync changes made to the memory mapped file back to the backing
>    * storage. For POSIX compliant systems this will fallback
> diff --git a/net/tap.c b/net/tap.c
> index 51f7aec39d..6fc3939078 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -385,6 +385,21 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
>       return s;
>   }
>   
> +static void close_all_fds_after_fork(int excluded_fd)
> +{
> +        const int skip_fd[] = {0, 1, 2, 3, excluded_fd};

3 should not be included here...

> +        unsigned int nskip = ARRAY_SIZE(skip_fd);
> +
> +        /*
> +         * skip_fd must be an ordered array of distinct fds, exclude
> +         * excluded_fd if already included in the [0 - 3] range
> +         */
> +        if (excluded_fd <= 3) {

or here -- stdin is 0, stdout is 1, stderr is 2.

Perhaps we need reminding of this and use the STD*_FILENO names instead of raw integer 
constants.


> @@ -400,13 +415,7 @@ static void launch_script(const char *setup_script, const char *ifname,
>           return;
>       }
>       if (pid == 0) {
> -        int open_max = sysconf(_SC_OPEN_MAX), i;
> -
> -        for (i = 3; i < open_max; i++) {
> -            if (i != fd) {
> -                close(i);
> -            }
> -        }

Note that the original *does* close 3.

> +#ifdef CONFIG_LINUX
> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
> +{
> +    struct dirent *de;
> +    int fd, dfd;
> +    bool close_fd;
> +    DIR *dir;
> +    int i;
> +
> +    dir = opendir("/proc/self/fd");
> +    if (!dir) {
> +        /* If /proc is not mounted, there is nothing that can be done. */
> +        return false;
> +    }
> +    /* Avoid closing the directory. */
> +    dfd = dirfd(dir);
> +
> +    for (de = readdir(dir); de; de = readdir(dir)) {
> +        fd = atoi(de->d_name);
> +        close_fd = true;
> +        if (fd == dfd) {
> +            close_fd = false;
> +        } else {
> +            for (i = 0; i < nskip; i++) {

The skip list is sorted, so you should remember the point of the last search and begin 
from there, and you should not search past fd < skip[i].


> +#else
> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
> +{
> +    return false;
> +}
> +#endif

I'm not fond of duplicating the function declaration.
I think it's better to move the ifdef inside:

static bool foo(...)
{
#ifdef XYZ
   impl
#else
   stub
#endif
}

> +
> +#ifdef CONFIG_CLOSE_RANGE
> +static bool qemu_close_all_open_fd_close_range(const int *skip,
> +                                               unsigned int nskip)
> +{
> +    int max_fd = sysconf(_SC_OPEN_MAX) - 1;
> +    int first = 0, last = max_fd;
> +    int cur_skip = 0, ret;
> +
> +    do {
> +        if (nskip) {
> +            while (first == skip[cur_skip]) {
> +                cur_skip++;
> +                first++;
> +            }

This fails to check cur_skip < nskip in the loop.
Mixing signed cur_skip with unsigned nskip is bad.

There seems to be no good reason for the separate "if (nskip)" check.
A proper check for cur_skip < nskip will work just fine with nskip == 0.

> +    /* Fallback */
> +    for (i = 0; i < open_max; i++) {
> +        if (i == skip[cur_skip]) {
> +            cur_skip++;
> +            continue;
> +        }
> +        close(i);
> +    }

Missing nskip test.


r~
Daniel P. Berrangé July 12, 2024, 3:12 p.m. UTC | #3
On Tue, Jun 18, 2024 at 01:17:03PM +0200, Clément Léger wrote:
> Since commit 03e471c41d8b ("qemu_init: increase NOFILE soft limit on
> POSIX"), the maximum number of file descriptors that can be opened are
> raised to nofile.rlim_max. On recent debian distro, this yield a maximum
> of 1073741816 file descriptors. Now, when forking to start
> qemu-bridge-helper, this actually calls close() on the full possible file
> descriptor range (more precisely [3 - sysconf(_SC_OPEN_MAX)]) which
> takes a considerable amount of time. In order to reduce that time,
> factorize existing code to close all open files descriptors in a new
> qemu_close_all_open_fd() function. This function uses various methods
> to close all the open file descriptors ranging from the most efficient
> one to the least one. It also accepts an ordered array of file
> descriptors that should not be closed since this is required by the
> callers that calls it after forking.
> 
> Signed-off-by: Clément Léger <cleger@rivosinc.com>
> 
> ----
> 
> v2:
>  - Factorize async_teardown.c close_fds implementation as well as tap.c ones
>  - Apply checkpatch
>  - v1: https://lore.kernel.org/qemu-devel/20240617162520.4045016-1-cleger@rivosinc.com/
> 
> ---
>  include/qemu/osdep.h    |   8 +++
>  net/tap.c               |  31 ++++++-----
>  system/async-teardown.c |  37 +------------
>  util/osdep.c            | 115 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 141 insertions(+), 50 deletions(-)
> 
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index f61edcfdc2..9369a97d3d 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -755,6 +755,14 @@ static inline void qemu_reset_optind(void)
>  
>  int qemu_fdatasync(int fd);
>  
> +/**
> + * Close all open file descriptors except the ones supplied in the @skip array
> + *
> + * @skip: ordered array of distinct file descriptors that should not be closed
> + * @nskip: number of entries in the @skip array.
> + */
> +void qemu_close_all_open_fd(const int *skip, unsigned int nskip);
> +
>  /**
>   * Sync changes made to the memory mapped file back to the backing
>   * storage. For POSIX compliant systems this will fallback
> diff --git a/net/tap.c b/net/tap.c
> index 51f7aec39d..6fc3939078 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -385,6 +385,21 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
>      return s;
>  }
>  
> +static void close_all_fds_after_fork(int excluded_fd)
> +{
> +        const int skip_fd[] = {0, 1, 2, 3, excluded_fd};
> +        unsigned int nskip = ARRAY_SIZE(skip_fd);
> +
> +        /*
> +         * skip_fd must be an ordered array of distinct fds, exclude
> +         * excluded_fd if already included in the [0 - 3] range
> +         */
> +        if (excluded_fd <= 3) {
> +            nskip--;
> +        }
> +        qemu_close_all_open_fd(skip_fd, nskip);
> +}

This is slightly over-indented - 4 space is QEMU normal style.

> diff --git a/util/osdep.c b/util/osdep.c
> index 5d23bbfbec..f3710710e3 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -625,3 +625,118 @@ int qemu_fdatasync(int fd)
>      return fsync(fd);
>  #endif
>  }
> +
> +#ifdef CONFIG_LINUX
> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
> +{
> +    struct dirent *de;
> +    int fd, dfd;
> +    bool close_fd;
> +    DIR *dir;
> +    int i;
> +
> +    dir = opendir("/proc/self/fd");
> +    if (!dir) {
> +        /* If /proc is not mounted, there is nothing that can be done. */
> +        return false;
> +    }
> +    /* Avoid closing the directory. */
> +    dfd = dirfd(dir);
> +
> +    for (de = readdir(dir); de; de = readdir(dir)) {

Don't we need

   if (de->d_name[0] == '.') {
       continue;
   }

otherwise atoi will fail and we'll try to close(0) multiple
times.

> +        fd = atoi(de->d_name);
> +        close_fd = true;
> +        if (fd == dfd) {
> +            close_fd = false;
> +        } else {
> +            for (i = 0; i < nskip; i++) {
> +                if (fd == skip[i]) {
> +                    close_fd = false;
> +                    break;
> +                }
> +            }
> +        }
> +        if (close_fd) {
> +            close(fd);
> +        }
> +    }
> +    closedir(dir);
> +
> +    return true;
> +}

With regards,
Daniel
Clément Léger July 16, 2024, 12:37 p.m. UTC | #4
On 12/07/2024 17:12, Daniel P. Berrangé wrote:
> On Tue, Jun 18, 2024 at 01:17:03PM +0200, Clément Léger wrote:
>> Since commit 03e471c41d8b ("qemu_init: increase NOFILE soft limit on
>> POSIX"), the maximum number of file descriptors that can be opened are
>> raised to nofile.rlim_max. On recent debian distro, this yield a maximum
>> of 1073741816 file descriptors. Now, when forking to start
>> qemu-bridge-helper, this actually calls close() on the full possible file
>> descriptor range (more precisely [3 - sysconf(_SC_OPEN_MAX)]) which
>> takes a considerable amount of time. In order to reduce that time,
>> factorize existing code to close all open files descriptors in a new
>> qemu_close_all_open_fd() function. This function uses various methods
>> to close all the open file descriptors ranging from the most efficient
>> one to the least one. It also accepts an ordered array of file
>> descriptors that should not be closed since this is required by the
>> callers that calls it after forking.
>>
>> Signed-off-by: Clément Léger <cleger@rivosinc.com>
>>
>> ----
>>
>> v2:
>>  - Factorize async_teardown.c close_fds implementation as well as tap.c ones
>>  - Apply checkpatch
>>  - v1: https://lore.kernel.org/qemu-devel/20240617162520.4045016-1-cleger@rivosinc.com/
>>
>> ---
>>  include/qemu/osdep.h    |   8 +++
>>  net/tap.c               |  31 ++++++-----
>>  system/async-teardown.c |  37 +------------
>>  util/osdep.c            | 115 ++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 141 insertions(+), 50 deletions(-)
>>
>> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
>> index f61edcfdc2..9369a97d3d 100644
>> --- a/include/qemu/osdep.h
>> +++ b/include/qemu/osdep.h
>> @@ -755,6 +755,14 @@ static inline void qemu_reset_optind(void)
>>  
>>  int qemu_fdatasync(int fd);
>>  
>> +/**
>> + * Close all open file descriptors except the ones supplied in the @skip array
>> + *
>> + * @skip: ordered array of distinct file descriptors that should not be closed
>> + * @nskip: number of entries in the @skip array.
>> + */
>> +void qemu_close_all_open_fd(const int *skip, unsigned int nskip);
>> +
>>  /**
>>   * Sync changes made to the memory mapped file back to the backing
>>   * storage. For POSIX compliant systems this will fallback
>> diff --git a/net/tap.c b/net/tap.c
>> index 51f7aec39d..6fc3939078 100644
>> --- a/net/tap.c
>> +++ b/net/tap.c
>> @@ -385,6 +385,21 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
>>      return s;
>>  }
>>  
>> +static void close_all_fds_after_fork(int excluded_fd)
>> +{
>> +        const int skip_fd[] = {0, 1, 2, 3, excluded_fd};
>> +        unsigned int nskip = ARRAY_SIZE(skip_fd);
>> +
>> +        /*
>> +         * skip_fd must be an ordered array of distinct fds, exclude
>> +         * excluded_fd if already included in the [0 - 3] range
>> +         */
>> +        if (excluded_fd <= 3) {
>> +            nskip--;
>> +        }
>> +        qemu_close_all_open_fd(skip_fd, nskip);
>> +}
> 
> This is slightly over-indented - 4 space is QEMU normal style.

Indeed, my bad.

> 
>> diff --git a/util/osdep.c b/util/osdep.c
>> index 5d23bbfbec..f3710710e3 100644
>> --- a/util/osdep.c
>> +++ b/util/osdep.c
>> @@ -625,3 +625,118 @@ int qemu_fdatasync(int fd)
>>      return fsync(fd);
>>  #endif
>>  }
>> +
>> +#ifdef CONFIG_LINUX
>> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
>> +{
>> +    struct dirent *de;
>> +    int fd, dfd;
>> +    bool close_fd;
>> +    DIR *dir;
>> +    int i;
>> +
>> +    dir = opendir("/proc/self/fd");
>> +    if (!dir) {
>> +        /* If /proc is not mounted, there is nothing that can be done. */
>> +        return false;
>> +    }
>> +    /* Avoid closing the directory. */
>> +    dfd = dirfd(dir);
>> +
>> +    for (de = readdir(dir); de; de = readdir(dir)) {
> 
> Don't we need
> 
>    if (de->d_name[0] == '.') {
>        continue;
>    }
> 
> otherwise atoi will fail and we'll try to close(0) multiple
> times.

Yes, that seems like it was the case for the previous code. I'll fix that.

Thanks,

Clément

> 
>> +        fd = atoi(de->d_name);
>> +        close_fd = true;
>> +        if (fd == dfd) {
>> +            close_fd = false;
>> +        } else {
>> +            for (i = 0; i < nskip; i++) {
>> +                if (fd == skip[i]) {
>> +                    close_fd = false;
>> +                    break;
>> +                }
>> +            }
>> +        }
>> +        if (close_fd) {
>> +            close(fd);
>> +        }
>> +    }
>> +    closedir(dir);
>> +
>> +    return true;
>> +}
> 
> With regards,
> Daniel
Clément Léger July 16, 2024, 1:42 p.m. UTC | #5
On 11/07/2024 20:43, Richard Henderson wrote:
> On 6/18/24 04:17, Clément Léger wrote:
>> Since commit 03e471c41d8b ("qemu_init: increase NOFILE soft limit on
>> POSIX"), the maximum number of file descriptors that can be opened are
>> raised to nofile.rlim_max. On recent debian distro, this yield a maximum
>> of 1073741816 file descriptors. Now, when forking to start
>> qemu-bridge-helper, this actually calls close() on the full possible file
>> descriptor range (more precisely [3 - sysconf(_SC_OPEN_MAX)]) which
>> takes a considerable amount of time. In order to reduce that time,
>> factorize existing code to close all open files descriptors in a new
>> qemu_close_all_open_fd() function. This function uses various methods
>> to close all the open file descriptors ranging from the most efficient
>> one to the least one. It also accepts an ordered array of file
>> descriptors that should not be closed since this is required by the
>> callers that calls it after forking.
>>
>> Signed-off-by: Clément Léger <cleger@rivosinc.com>
>>
>> ----
>>
>> v2:
>>   - Factorize async_teardown.c close_fds implementation as well as
>> tap.c ones
>>   - Apply checkpatch
>>   - v1:
>> https://lore.kernel.org/qemu-devel/20240617162520.4045016-1-cleger@rivosinc.com/
>>
>> ---
>>   include/qemu/osdep.h    |   8 +++
>>   net/tap.c               |  31 ++++++-----
>>   system/async-teardown.c |  37 +------------
>>   util/osdep.c            | 115 ++++++++++++++++++++++++++++++++++++++++
>>   4 files changed, 141 insertions(+), 50 deletions(-)
>>
>> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
>> index f61edcfdc2..9369a97d3d 100644
>> --- a/include/qemu/osdep.h
>> +++ b/include/qemu/osdep.h
>> @@ -755,6 +755,14 @@ static inline void qemu_reset_optind(void)
>>     int qemu_fdatasync(int fd);
>>   +/**
>> + * Close all open file descriptors except the ones supplied in the
>> @skip array
>> + *
>> + * @skip: ordered array of distinct file descriptors that should not
>> be closed
>> + * @nskip: number of entries in the @skip array.
>> + */
>> +void qemu_close_all_open_fd(const int *skip, unsigned int nskip);
>> +
>>   /**
>>    * Sync changes made to the memory mapped file back to the backing
>>    * storage. For POSIX compliant systems this will fallback
>> diff --git a/net/tap.c b/net/tap.c
>> index 51f7aec39d..6fc3939078 100644
>> --- a/net/tap.c
>> +++ b/net/tap.c
>> @@ -385,6 +385,21 @@ static TAPState *net_tap_fd_init(NetClientState
>> *peer,
>>       return s;
>>   }
>>   +static void close_all_fds_after_fork(int excluded_fd)
>> +{
>> +        const int skip_fd[] = {0, 1, 2, 3, excluded_fd};
> 
> 3 should not be included here...

Oh yes, my bad.

> 
>> +        unsigned int nskip = ARRAY_SIZE(skip_fd);
>> +
>> +        /*
>> +         * skip_fd must be an ordered array of distinct fds, exclude
>> +         * excluded_fd if already included in the [0 - 3] range
>> +         */
>> +        if (excluded_fd <= 3) {
> 
> or here -- stdin is 0, stdout is 1, stderr is 2.
> 
> Perhaps we need reminding of this and use the STD*_FILENO names instead
> of raw integer constants.

Indeed, I'll switch to using STD*_FILENO.

> 
> 
>> @@ -400,13 +415,7 @@ static void launch_script(const char
>> *setup_script, const char *ifname,
>>           return;
>>       }
>>       if (pid == 0) {
>> -        int open_max = sysconf(_SC_OPEN_MAX), i;
>> -
>> -        for (i = 3; i < open_max; i++) {
>> -            if (i != fd) {
>> -                close(i);
>> -            }
>> -        }
> 
> Note that the original *does* close 3.
> 
>> +#ifdef CONFIG_LINUX
>> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int
>> nskip)
>> +{
>> +    struct dirent *de;
>> +    int fd, dfd;
>> +    bool close_fd;
>> +    DIR *dir;
>> +    int i;
>> +
>> +    dir = opendir("/proc/self/fd");
>> +    if (!dir) {
>> +        /* If /proc is not mounted, there is nothing that can be
>> done. */
>> +        return false;
>> +    }
>> +    /* Avoid closing the directory. */
>> +    dfd = dirfd(dir);
>> +
>> +    for (de = readdir(dir); de; de = readdir(dir)) {
>> +        fd = atoi(de->d_name);
>> +        close_fd = true;
>> +        if (fd == dfd) {
>> +            close_fd = false;
>> +        } else {
>> +            for (i = 0; i < nskip; i++) {
> 
> The skip list is sorted, so you should remember the point of the last
> search and begin from there, and you should not search past fd < skip[i].

readdir values are not ordered so the best I can do is restrict
start/end once found in the fds and bail out when < skip[i] as you
pointed out.

for (i = skip_start; i < skip_end; i++) {
    if (fd < skip[i]) {
        /* We are below the next skipped fd, break */
        break;
    } else if (fd == skip[i]) {
        close_fd = false;
        /* Restrict the range as we found fds matching start/end */
        if (i == skip_start)
            skip_start++;
        else if (i == skip_end)
            skip_end--;
        break;
    }
}


> 
> 
>> +#else
>> +static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int
>> nskip)
>> +{
>> +    return false;
>> +}
>> +#endif
> 
> I'm not fond of duplicating the function declaration.
> I think it's better to move the ifdef inside:
> 
> static bool foo(...)
> {
> #ifdef XYZ
>   impl
> #else
>   stub
> #endif
> }
> 

Acked.

>> +
>> +#ifdef CONFIG_CLOSE_RANGE
>> +static bool qemu_close_all_open_fd_close_range(const int *skip,
>> +                                               unsigned int nskip)
>> +{
>> +    int max_fd = sysconf(_SC_OPEN_MAX) - 1;
>> +    int first = 0, last = max_fd;
>> +    int cur_skip = 0, ret;
>> +
>> +    do {
>> +        if (nskip) {
>> +            while (first == skip[cur_skip]) {
>> +                cur_skip++;
>> +                first++;
>> +            }
> 
> This fails to check cur_skip < nskip in the loop.
> Mixing signed cur_skip with unsigned nskip is bad.
> 
> There seems to be no good reason for the separate "if (nskip)" check.
> A proper check for cur_skip < nskip will work just fine with nskip == 0.

I'll try to rework that.

> 
>> +    /* Fallback */
>> +    for (i = 0; i < open_max; i++) {
>> +        if (i == skip[cur_skip]) {
>> +            cur_skip++;
>> +            continue;
>> +        }
>> +        close(i);
>> +    }
> 
> Missing nskip test.
I'll fix that.

Thanks,

Clément

> 
> 
> r~
>
diff mbox series

Patch

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index f61edcfdc2..9369a97d3d 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -755,6 +755,14 @@  static inline void qemu_reset_optind(void)
 
 int qemu_fdatasync(int fd);
 
+/**
+ * Close all open file descriptors except the ones supplied in the @skip array
+ *
+ * @skip: ordered array of distinct file descriptors that should not be closed
+ * @nskip: number of entries in the @skip array.
+ */
+void qemu_close_all_open_fd(const int *skip, unsigned int nskip);
+
 /**
  * Sync changes made to the memory mapped file back to the backing
  * storage. For POSIX compliant systems this will fallback
diff --git a/net/tap.c b/net/tap.c
index 51f7aec39d..6fc3939078 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -385,6 +385,21 @@  static TAPState *net_tap_fd_init(NetClientState *peer,
     return s;
 }
 
+static void close_all_fds_after_fork(int excluded_fd)
+{
+        const int skip_fd[] = {0, 1, 2, 3, excluded_fd};
+        unsigned int nskip = ARRAY_SIZE(skip_fd);
+
+        /*
+         * skip_fd must be an ordered array of distinct fds, exclude
+         * excluded_fd if already included in the [0 - 3] range
+         */
+        if (excluded_fd <= 3) {
+            nskip--;
+        }
+        qemu_close_all_open_fd(skip_fd, nskip);
+}
+
 static void launch_script(const char *setup_script, const char *ifname,
                           int fd, Error **errp)
 {
@@ -400,13 +415,7 @@  static void launch_script(const char *setup_script, const char *ifname,
         return;
     }
     if (pid == 0) {
-        int open_max = sysconf(_SC_OPEN_MAX), i;
-
-        for (i = 3; i < open_max; i++) {
-            if (i != fd) {
-                close(i);
-            }
-        }
+        close_all_fds_after_fork(fd);
         parg = args;
         *parg++ = (char *)setup_script;
         *parg++ = (char *)ifname;
@@ -490,17 +499,11 @@  static int net_bridge_run_helper(const char *helper, const char *bridge,
         return -1;
     }
     if (pid == 0) {
-        int open_max = sysconf(_SC_OPEN_MAX), i;
         char *fd_buf = NULL;
         char *br_buf = NULL;
         char *helper_cmd = NULL;
 
-        for (i = 3; i < open_max; i++) {
-            if (i != sv[1]) {
-                close(i);
-            }
-        }
-
+        close_all_fds_after_fork(sv[1]);
         fd_buf = g_strdup_printf("%s%d", "--fd=", sv[1]);
 
         if (strrchr(helper, ' ') || strrchr(helper, '\t')) {
diff --git a/system/async-teardown.c b/system/async-teardown.c
index 396963c091..9148ee8d04 100644
--- a/system/async-teardown.c
+++ b/system/async-teardown.c
@@ -26,40 +26,6 @@ 
 
 static pid_t the_ppid;
 
-/*
- * Close all open file descriptors.
- */
-static void close_all_open_fd(void)
-{
-    struct dirent *de;
-    int fd, dfd;
-    DIR *dir;
-
-#ifdef CONFIG_CLOSE_RANGE
-    int r = close_range(0, ~0U, 0);
-    if (!r) {
-        /* Success, no need to try other ways. */
-        return;
-    }
-#endif
-
-    dir = opendir("/proc/self/fd");
-    if (!dir) {
-        /* If /proc is not mounted, there is nothing that can be done. */
-        return;
-    }
-    /* Avoid closing the directory. */
-    dfd = dirfd(dir);
-
-    for (de = readdir(dir); de; de = readdir(dir)) {
-        fd = atoi(de->d_name);
-        if (fd != dfd) {
-            close(fd);
-        }
-    }
-    closedir(dir);
-}
-
 static void hup_handler(int signal)
 {
     /* Check every second if this process has been reparented. */
@@ -85,9 +51,8 @@  static int async_teardown_fn(void *arg)
     /*
      * Close all file descriptors that might have been inherited from the
      * main qemu process when doing clone, needed to make libvirt happy.
-     * Not using close_range for increased compatibility with older kernels.
      */
-    close_all_open_fd();
+    qemu_close_all_open_fd(NULL, 0);
 
     /* Set up a handler for SIGHUP and unblock SIGHUP. */
     sigaction(SIGHUP, &sa, NULL);
diff --git a/util/osdep.c b/util/osdep.c
index 5d23bbfbec..f3710710e3 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -625,3 +625,118 @@  int qemu_fdatasync(int fd)
     return fsync(fd);
 #endif
 }
+
+#ifdef CONFIG_LINUX
+static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
+{
+    struct dirent *de;
+    int fd, dfd;
+    bool close_fd;
+    DIR *dir;
+    int i;
+
+    dir = opendir("/proc/self/fd");
+    if (!dir) {
+        /* If /proc is not mounted, there is nothing that can be done. */
+        return false;
+    }
+    /* Avoid closing the directory. */
+    dfd = dirfd(dir);
+
+    for (de = readdir(dir); de; de = readdir(dir)) {
+        fd = atoi(de->d_name);
+        close_fd = true;
+        if (fd == dfd) {
+            close_fd = false;
+        } else {
+            for (i = 0; i < nskip; i++) {
+                if (fd == skip[i]) {
+                    close_fd = false;
+                    break;
+                }
+            }
+        }
+        if (close_fd) {
+            close(fd);
+        }
+    }
+    closedir(dir);
+
+    return true;
+}
+#else
+static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
+{
+    return false;
+}
+#endif
+
+#ifdef CONFIG_CLOSE_RANGE
+static bool qemu_close_all_open_fd_close_range(const int *skip,
+                                               unsigned int nskip)
+{
+    int max_fd = sysconf(_SC_OPEN_MAX) - 1;
+    int first = 0, last = max_fd;
+    int cur_skip = 0, ret;
+
+    do {
+        if (nskip) {
+            while (first == skip[cur_skip]) {
+                cur_skip++;
+                first++;
+            }
+            if (cur_skip < nskip) {
+                last = skip[cur_skip] - 1;
+            }
+            if (last > max_fd) {
+                last = max_fd;
+                /*
+                 * We can directly skip the remaining skip fds since the current
+                 * one is already above the maximum supported one.
+                 */
+                cur_skip = nskip;
+            }
+            if (first > last) {
+                break;
+            }
+        }
+        ret = close_range(first, last, 0);
+        if (ret < 0) {
+            return false;
+        }
+        first = last + 1;
+        last = max_fd;
+    } while (cur_skip < nskip);
+
+    return true;
+}
+#else
+static bool qemu_close_all_open_fd_close_range(const int *skip,
+                                               unsigned int nskip)
+{
+    return false;
+}
+#endif
+
+void qemu_close_all_open_fd(const int *skip, unsigned int nskip)
+{
+    int open_max = sysconf(_SC_OPEN_MAX);
+    int cur_skip = 0, i;
+
+    if (qemu_close_all_open_fd_close_range(skip, nskip)) {
+        return;
+    }
+
+    if (qemu_close_all_open_fd_proc(skip, nskip)) {
+        return;
+    }
+
+    /* Fallback */
+    for (i = 0; i < open_max; i++) {
+        if (i == skip[cur_skip]) {
+            cur_skip++;
+            continue;
+        }
+        close(i);
+    }
+}