
[3/5] um: Do a double clone to disable rseq

Message ID: 20240528085419.1964424-4-benjamin@sipsolutions.net
State: Superseded
Series: Increased address space for 64 bit

Commit Message

Benjamin Berg May 28, 2024, 8:54 a.m. UTC
From: Benjamin Berg <benjamin.berg@intel.com>

Newer glibc versions enable rseq support by default. The registration
remains active in the cloned child process, potentially causing the host
kernel to read/write memory in the child.

It appears that this has not been an issue so far purely because the
memory area used happened to be above TASK_SIZE and remained mapped.

Note that a better approach would be to exec a small static binary that
is not linked against any other libraries. Using a memfd and execveat(),
the binary could be embedded into UML itself, resulting in an entirely
clean execution environment for userspace.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
---
 arch/um/os-Linux/skas/process.c | 54 ++++++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 4 deletions(-)
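
A minimal sketch of the memfd/execveat idea mentioned above, assuming a
hypothetical embedded blob (stub_bin/stub_bin_len) and an illustrative
helper name; this is not part of the posted patch and error handling is
kept to a minimum:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

extern const unsigned char stub_bin[];	/* hypothetical embedded binary */
extern const unsigned long stub_bin_len;

static int exec_embedded_stub(char *const envp[])
{
	char *const argv[] = { "uml-stub", NULL };
	int fd;

	/* Anonymous in-memory file holding the embedded binary. */
	fd = memfd_create("uml-stub", MFD_CLOEXEC);
	if (fd < 0)
		return -errno;

	if (write(fd, stub_bin, stub_bin_len) != (ssize_t)stub_bin_len)
		return -EIO;

	/*
	 * Execute the memfd itself: an empty pathname together with
	 * AT_EMPTY_PATH runs the file referred to by fd. Recent glibc
	 * provides an execveat() wrapper; older versions would need
	 * syscall(SYS_execveat, ...).
	 */
	execveat(fd, "", argv, envp, AT_EMPTY_PATH);
	return -errno;	/* only reached if execveat() failed */
}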

Comments

Tiwei Bie May 28, 2024, 10:16 a.m. UTC | #1
On 5/28/24 4:54 PM, benjamin@sipsolutions.net wrote:
> From: Benjamin Berg <benjamin.berg@intel.com>
> 
> Newer glibc versions are enabling rseq support by default. This remains
> enabled in the cloned child process, potentially causing the host kernel
> to write/read memory in the child.
> 
> It appears that this was purely not an issue because the used memory
> area happened to be above TASK_SIZE and remains mapped.

I also encountered this issue. In my case, with "Force a static link"
(CONFIG_STATIC_LINK) enabled, UML will crash immediately every time
it starts up. I worked around this by setting the glibc.pthread.rseq
tunable via GLIBC_TUNABLES [1] before launching UML.

So another easy way to work around this issue without introducing runtime
overhead might be to add the GLIBC_TUNABLES=glibc.pthread.rseq=0 environment
variable and exec /proc/self/exe in UML on startup.

[1] https://www.gnu.org/software/libc/manual/html_node/Tunables.html
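
A rough sketch of that workaround (the helper name and call site are
hypothetical, UML's real startup path is not shown, and the guard assumes
GLIBC_TUNABLES is not already used for something else):

#include <stdlib.h>
#include <unistd.h>

/* Re-exec ourselves once with rseq disabled via the glibc tunable. */
static void maybe_reexec_without_rseq(char **argv)
{
	if (getenv("GLIBC_TUNABLES"))
		return;

	setenv("GLIBC_TUNABLES", "glibc.pthread.rseq=0", 1);
	execv("/proc/self/exe", argv);
	/* If execv() fails, just continue without the workaround. */
}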

Regards,
Tiwei

Benjamin Berg May 28, 2024, 10:30 a.m. UTC | #2
Hi Tiwei,

On Tue, 2024-05-28 at 18:16 +0800, Tiwei Bie wrote:
> On 5/28/24 4:54 PM, benjamin@sipsolutions.net wrote:
> > From: Benjamin Berg <benjamin.berg@intel.com>
> > 
> > Newer glibc versions are enabling rseq support by default. This remains
> > enabled in the cloned child process, potentially causing the host kernel
> > to write/read memory in the child.
> > 
> > It appears that this was purely not an issue because the used memory
> > area happened to be above TASK_SIZE and remains mapped.
> 
> I also encountered this issue. In my case, with "Force a static link"
> (CONFIG_STATIC_LINK) enabled, UML will crash immediately every time
> it starts up. I worked around this by setting the glibc.pthread.rseq
> tunable via GLIBC_TUNABLES [1] before launching UML.
> 
> So another easy way to work around this issue without introducing runtime
> overhead might be to add the GLIBC_TUNABLES=glibc.pthread.rseq=0 environment
> variable and exec /proc/self/exe in UML on startup.

I am not really worried about the overhead, but I agree that setting
GLIBC_TUNABLES is also a reasonable solution to the problem.

Doing the memfd/execveat dance with an embedded static binary would
still be best in my view, but either this or GLIBC_TUNABLES seems fine
in the meantime.

Do you want to submit the patch? Should I re-roll the patchset with
GLIBC_TUNABLES?

Benjamin

> [1] https://www.gnu.org/software/libc/manual/html_node/Tunables.html
> 
> Regards,
> Tiwei
Tiwei Bie May 28, 2024, 11:03 a.m. UTC | #3
Hi Benjamin,

On 5/28/24 6:30 PM, Benjamin Berg wrote:
> On Tue, 2024-05-28 at 18:16 +0800, Tiwei Bie wrote:
>> On 5/28/24 4:54 PM, benjamin@sipsolutions.net wrote:
>>> From: Benjamin Berg <benjamin.berg@intel.com>
>>>
>>> Newer glibc versions are enabling rseq support by default. This remains
>>> enabled in the cloned child process, potentially causing the host kernel
>>> to write/read memory in the child.
>>>
>>> It appears that this was purely not an issue because the used memory
>>> area happened to be above TASK_SIZE and remains mapped.
>>
>> I also encountered this issue. In my case, with "Force a static link"
>> (CONFIG_STATIC_LINK) enabled, UML will crash immediately every time
>> it starts up. I worked around this by setting the glibc.pthread.rseq
>> tunable via GLIBC_TUNABLES [1] before launching UML.
>>
>> So another easy way to work around this issue without introducing runtime
>> overhead might be to add the GLIBC_TUNABLES=glibc.pthread.rseq=0 environment
>> variable and exec /proc/self/exe in UML on startup.
> 
> I am not really worried about the overhead, but I agree that setting
> GLIBC_TUNABLES is also a reasonable solution to the problem.
> 
> Doing the memfd/execveat dance with an embedded static binary would
> still be best in my view, but either this or GLIBC_TUNABLES seem fine
> in the meantime.
> 
> Do you want to submit the patch? Should I re-roll the patchset with
> GLIBC_TUNABLES?

Thanks for asking! :)

I don't have such a patch at the moment. I worked around this issue
using a script. I saw that you are already doing execve("/proc/self/exe",
...) in PATCH 4/5. Please just feel free to re-roll the patchset with
GLIBC_TUNABLES if you would like to choose this solution.

Regards,
Tiwei


> 
> Benjamin
> 
>> [1] https://www.gnu.org/software/libc/manual/html_node/Tunables.html
>>
>> Regards,
>> Tiwei
>>
Johannes Berg May 28, 2024, 11:57 a.m. UTC | #4
On Tue, 2024-05-28 at 18:16 +0800, Tiwei Bie wrote:
> On 5/28/24 4:54 PM, benjamin@sipsolutions.net wrote:
> > From: Benjamin Berg <benjamin.berg@intel.com>
> > 
> > Newer glibc versions are enabling rseq support by default. This remains
> > enabled in the cloned child process, potentially causing the host kernel
> > to write/read memory in the child.
> > 
> > It appears that this was purely not an issue because the used memory
> > area happened to be above TASK_SIZE and remains mapped.
> 
> I also encountered this issue. In my case, with "Force a static link"
> (CONFIG_STATIC_LINK) enabled, UML will crash immediately every time
> it starts up. I worked around this by setting the glibc.pthread.rseq
> tunable via GLIBC_TUNABLES [1] before launching UML.
> 
> So another easy way to work around this issue without introducing runtime
> overhead might be to add the GLIBC_TUNABLES=glibc.pthread.rseq=0 environment
> variable and exec /proc/self/exe in UML on startup.
> 

It's also a bit of a question what to rely on - this would introduce a
dependency on glibc behaviour, whereas doing the double-clone proposed
here will work purely because of host kernel behaviour, regardless of
what part of the system set up rseq, how the tunables work, etc.

johannes
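
For reference, the host-kernel behaviour in question: the kernel clears a
task's rseq registration when it is cloned with CLONE_VM. Paraphrased from
memory rather than copied from any particular kernel tree, the fork path
looks roughly like this (other rseq fields omitted):

static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
	if (clone_flags & CLONE_VM) {
		/* Sharing the mm: do not inherit the parent's rseq area. */
		t->rseq = NULL;
	} else {
		/*
		 * Plain fork: the child inherits the registration, which is
		 * what bites the UML userspace children without this patch.
		 */
		t->rseq = current->rseq;
	}
}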
Tiwei Bie May 28, 2024, 2:13 p.m. UTC | #5
On 5/28/24 7:57 PM, Johannes Berg wrote:
> On Tue, 2024-05-28 at 18:16 +0800, Tiwei Bie wrote:
>> On 5/28/24 4:54 PM, benjamin@sipsolutions.net wrote:
>>> From: Benjamin Berg <benjamin.berg@intel.com>
>>>
>>> Newer glibc versions are enabling rseq support by default. This remains
>>> enabled in the cloned child process, potentially causing the host kernel
>>> to write/read memory in the child.
>>>
>>> It appears that this was purely not an issue because the used memory
>>> area happened to be above TASK_SIZE and remains mapped.
>>
>> I also encountered this issue. In my case, with "Force a static link"
>> (CONFIG_STATIC_LINK) enabled, UML will crash immediately every time
>> it starts up. I worked around this by setting the glibc.pthread.rseq
>> tunable via GLIBC_TUNABLES [1] before launching UML.
>>
>> So another easy way to work around this issue without introducing runtime
>> overhead might be to add the GLIBC_TUNABLES=glibc.pthread.rseq=0 environment
>> variable and exec /proc/self/exe in UML on startup.
>>
> 
> It's also a bit of a question what to rely on - this would introduce a
> dependency on glibc behaviour, whereas doing the double-clone proposed
> here will work purely because of host kernel behaviour, regardless of
> what part of the system set up rseq, how the tunables work, etc.

Makes sense. My previous concern was primarily about the runtime overhead,
but after taking a closer look at the patch, I realized that the double-clone
won't happen on the critical path, so there shouldn't be any performance
issues. I also think the double-clone proposal is better. :)

Regards,
Tiwei
Tiwei Bie May 30, 2024, 2:54 a.m. UTC | #6
On 5/28/24 10:13 PM, Tiwei Bie wrote:
> On 5/28/24 7:57 PM, Johannes Berg wrote:
>> On Tue, 2024-05-28 at 18:16 +0800, Tiwei Bie wrote:
>>> On 5/28/24 4:54 PM, benjamin@sipsolutions.net wrote:
>>>> From: Benjamin Berg <benjamin.berg@intel.com>
>>>>
>>>> Newer glibc versions are enabling rseq support by default. This remains
>>>> enabled in the cloned child process, potentially causing the host kernel
>>>> to write/read memory in the child.
>>>>
>>>> It appears that this was purely not an issue because the used memory
>>>> area happened to be above TASK_SIZE and remains mapped.
>>>
>>> I also encountered this issue. In my case, with "Force a static link"
>>> (CONFIG_STATIC_LINK) enabled, UML will crash immediately every time
>>> it starts up. I worked around this by setting the glibc.pthread.rseq
>>> tunable via GLIBC_TUNABLES [1] before launching UML.
>>>
>>> So another easy way to work around this issue without introducing runtime
>>> overhead might be to add the GLIBC_TUNABLES=glibc.pthread.rseq=0 environment
>>> variable and exec /proc/self/exe in UML on startup.
>>>
>>
>> It's also a bit of a question what to rely on - this would introduce a
>> dependency on glibc behaviour, whereas doing the double-clone proposed
>> here will work purely because of host kernel behaviour, regardless of
>> what part of the system set up rseq, how the tunables work, etc.
> 
> Makes sense. My previous concern was primarily about the runtime overhead,
> but after taking a closer look at the patch, I realized that the double-clone
> won't happen on the critical path, so there shouldn't be any performance
> issues. I also think the double-clone proposal is better. :)

But when combined with this series [1], things might be different.
Double-clone will happen for each new mm context. That's something
we might want to avoid.

[1] https://patchwork.ozlabs.org/project/linux-um/list/?series=408104

Regards,
Tiwei
Benjamin Berg May 30, 2024, 8:54 a.m. UTC | #7
Hi,

On Thu, 2024-05-30 at 10:54 +0800, Tiwei Bie wrote:
> On 5/28/24 10:13 PM, Tiwei Bie wrote:
> > On 5/28/24 7:57 PM, Johannes Berg wrote:
> > > On Tue, 2024-05-28 at 18:16 +0800, Tiwei Bie wrote:
> > > > On 5/28/24 4:54 PM, benjamin@sipsolutions.net wrote:
> > > > > From: Benjamin Berg <benjamin.berg@intel.com>
> > > > > 
> > > > > Newer glibc versions are enabling rseq support by default. This remains
> > > > > enabled in the cloned child process, potentially causing the host kernel
> > > > > to write/read memory in the child.
> > > > > 
> > > > > It appears that this was purely not an issue because the used memory
> > > > > area happened to be above TASK_SIZE and remains mapped.
> > > > 
> > > > I also encountered this issue. In my case, with "Force a static link"
> > > > (CONFIG_STATIC_LINK) enabled, UML will crash immediately every time
> > > > it starts up. I worked around this by setting the glibc.pthread.rseq
> > > > tunable via GLIBC_TUNABLES [1] before launching UML.
> > > > 
> > > > So another easy way to work around this issue without introducing runtime
> > > > overhead might be to add the GLIBC_TUNABLES=glibc.pthread.rseq=0 environment
> > > > variable and exec /proc/self/exe in UML on startup.
> > > > 
> > > 
> > > It's also a bit of a question what to rely on - this would introduce a
> > > dependency on glibc behaviour, whereas doing the double-clone proposed
> > > here will work purely because of host kernel behaviour, regardless of
> > > what part of the system set up rseq, how the tunables work, etc.
> > 
> > Makes sense. My previous concern was primarily about the runtime overhead,
> > but after taking a closer look at the patch, I realized that the double-clone
> > won't happen on the critical path, so there shouldn't be any performance
> > issues. I also think the double-clone proposal is better. :)
> 
> But when combined with this series [1], things might be different..
> Double-clone will happen for each new mm context. That's something
> we might want to avoid.

I cannot believe that this overhead is something to worry about. The
CLONE_VM step should be really fast compared to the second clone as it
runs in the same MM as the kernel (it is how posix_spawn avoids the
fork overhead to execute another process if possible).

Note that using execve in the second step would speed things up even
more as the process will then run in a new MM instead of copying the
kernel MM and cleaning it.

That said, this patch can be made simpler by using CLONE_VFORK.
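
A sketch of how that CLONE_VFORK variant could look, based on the hunks in
this patch (untested; error paths abbreviated, the real function would
still need the out_kill cleanup):

	/*
	 * CLONE_VFORK blocks the caller until the helper exits, so
	 * tramp_data.pid is guaranteed to be filled in once clone()
	 * returns and the first waitpid() loop can be dropped.
	 */
	pid = clone(userspace_tramp_clone_vm, (void *) sp,
		    CLONE_VFORK | CLONE_VM | CLONE_FILES | SIGCHLD,
		    &tramp_data);
	if (pid < 0) {
		err = -errno;
		printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
		       __func__, errno);
		return err;
	}

	/* The helper has already exited; just reap it. */
	CATCH_EINTR(waitpid(pid, NULL, __WALL));

	pid = tramp_data.pid;
	if (pid < 0)
		return pid;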

Benjamin

> 
> [1] https://patchwork.ozlabs.org/project/linux-um/list/?series=408104
> 
> Regards,
> Tiwei
> 
>
Tiwei Bie May 30, 2024, 2:05 p.m. UTC | #8
On 5/30/24 4:54 PM, Benjamin Berg wrote:
> Hi,
> 
> On Thu, 2024-05-30 at 10:54 +0800, Tiwei Bie wrote:
>> On 5/28/24 10:13 PM, Tiwei Bie wrote:
>>> On 5/28/24 7:57 PM, Johannes Berg wrote:
>>>> On Tue, 2024-05-28 at 18:16 +0800, Tiwei Bie wrote:
>>>>> On 5/28/24 4:54 PM, benjamin@sipsolutions.net wrote:
>>>>>> From: Benjamin Berg <benjamin.berg@intel.com>
>>>>>>
>>>>>> Newer glibc versions are enabling rseq support by default. This remains
>>>>>> enabled in the cloned child process, potentially causing the host kernel
>>>>>> to write/read memory in the child.
>>>>>>
>>>>>> It appears that this was purely not an issue because the used memory
>>>>>> area happened to be above TASK_SIZE and remains mapped.
>>>>>
>>>>> I also encountered this issue. In my case, with "Force a static link"
>>>>> (CONFIG_STATIC_LINK) enabled, UML will crash immediately every time
>>>>> it starts up. I worked around this by setting the glibc.pthread.rseq
>>>>> tunable via GLIBC_TUNABLES [1] before launching UML.
>>>>>
>>>>> So another easy way to work around this issue without introducing runtime
>>>>> overhead might be to add the GLIBC_TUNABLES=glibc.pthread.rseq=0 environment
>>>>> variable and exec /proc/self/exe in UML on startup.
>>>>>
>>>>
>>>> It's also a bit of a question what to rely on - this would introduce a
>>>> dependency on glibc behaviour, whereas doing the double-clone proposed
>>>> here will work purely because of host kernel behaviour, regardless of
>>>> what part of the system set up rseq, how the tunables work, etc.
>>>
>>> Makes sense. My previous concern was primarily about the runtime overhead,
>>> but after taking a closer look at the patch, I realized that the double-clone
>>> won't happen on the critical path, so there shouldn't be any performance
>>> issues. I also think the double-clone proposal is better. :)
>>
>> But when combined with this series [1], things might be different..
>> Double-clone will happen for each new mm context. That's something
>> we might want to avoid.
> 
> I cannot believe that this overhead is something to worry about. The
> CLONE_VM step should be really fast compared to the second clone as it
> runs in the same MM as the kernel (it is how posix_spawn avoids the
> fork overhead to execute another process if possible).

Hmm.. I just think that creating a temporary "thread" every time to do this
is perhaps a bit unnecessary. But I also agree that using GLIBC_TUNABLES in
UML will introduce a dependency on glibc behaviour, which is undesirable.
Honestly, I don't have a strong opinion on this.

Regards,
Tiwei

> 
> Note that using execve in the second step would speed things up even
> more as the process will then run in a new MM instead of copying the
> kernel MM and cleaning it.
> 
> That said, this patch can be made simpler by using CLONE_VFORK.
> 
> Benjamin
> 
>>
>> [1] https://patchwork.ozlabs.org/project/linux-um/list/?series=408104
>>
>> Regards,
>> Tiwei
>>
>>

Patch

diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 41a288dcfc34..ee332a2aeea6 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -255,6 +255,31 @@  static int userspace_tramp(void *stack)
 int userspace_pid[NR_CPUS];
 int kill_userspace_mm[NR_CPUS];
 
+struct tramp_data {
+	int pid;
+	void *clone_sp;
+	void *stack;
+};
+
+static int userspace_tramp_clone_vm(void *data)
+{
+	struct tramp_data *tramp_data = data;
+
+	/*
+	 * This helper exist to do a double-clone. First with CLONE_VM which
+	 * effectively disables things like rseq, and then the second one to
+	 * get a new memory space.
+	 */
+
+	tramp_data->pid = clone(userspace_tramp, tramp_data->clone_sp,
+				CLONE_PARENT | CLONE_FILES | SIGCHLD,
+				tramp_data->stack);
+	if (tramp_data->pid < 0)
+		tramp_data->pid = -errno;
+
+	exit(0);
+}
+
 /**
  * start_userspace() - prepare a new userspace process
  * @stub_stack:	pointer to the stub stack.
@@ -268,9 +293,10 @@  int kill_userspace_mm[NR_CPUS];
  */
 int start_userspace(unsigned long stub_stack)
 {
+	struct tramp_data tramp_data;
 	void *stack;
 	unsigned long sp;
-	int pid, status, n, flags, err;
+	int pid, status, n, err;
 
 	/* setup a temporary stack page */
 	stack = mmap(NULL, UM_KERN_PAGE_SIZE,
@@ -286,10 +312,13 @@  int start_userspace(unsigned long stub_stack)
 	/* set stack pointer to the end of the stack page, so it can grow downwards */
 	sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;
 
-	flags = CLONE_FILES | SIGCHLD;
+	tramp_data.stack = (void *) stub_stack;
+	tramp_data.clone_sp = (void *) sp;
+	tramp_data.pid = -EINVAL;
 
 	/* clone into new userspace process */
-	pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack);
+	pid = clone(userspace_tramp_clone_vm, (void *) sp,
+		    CLONE_VM | CLONE_FILES | SIGCHLD, &tramp_data);
 	if (pid < 0) {
 		err = -errno;
 		printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
@@ -305,7 +334,24 @@  int start_userspace(unsigned long stub_stack)
 			       __func__, errno);
 			goto out_kill;
 		}
-	} while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));
+	} while (!WIFEXITED(status));
+
+	pid = tramp_data.pid;
+	if (pid < 0) {
+		printk(UM_KERN_ERR "%s : second clone failed, errno = %d\n",
+		       __func__, -pid);
+		return pid;
+	}
+
+	do {
+		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
+		if (n < 0) {
+			err = -errno;
+			printk(UM_KERN_ERR "%s : wait failed, errno = %d\n",
+			       __func__, errno);
+			goto out_kill;
+		}
+	} while (WIFEXITED(status) && (WSTOPSIG(status) == SIGALRM));
 
 	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
 		err = -EINVAL;