Message ID | 20240128163958.17421-1-safinaskar@zohomail.com |
---|---|
State | New |
Headers | show |
Series | [RFC] Add public function syscall_no_errno | expand |
On 28/01/24 13:39, Askar Safin wrote: > Hi! I want glibc to have function "syscall_no_errno" on Linux. > It should do the same "syscall" does, but it should not interpret > return value and set errno. This is useful for calling syscalls > such as getuid. I. e. now the user can call directly all syscalls > including getuid and similar. > > I add example patch. It is quick-and-dirty. I was unable to figure out > how to add function to headers. So, please, don't apply it as-is. > > I just want to know do you agree with my proposal. If yes, I will try > to write better patch. > > I will repeat: currently glibc is simply incomplete, because it > does not provide a way to call directly syscalls, such as getuid. > So the user have to craft assembly, which is very difficult. Indeed there are some old syscalls where trying to issue them directly with syscall is problematic (like 'time' and 'brk' for some ABIs), but getuid is not one of them. Also, recent Linux kABI is trying to avoid such problematic interfaces to return the value as the return code and make the invalid value similar to all cases. So these are not very compelling reasons to add a non-standard symbol to issue syscalls. CCing Rich, maybe he has a different view about this. > > The patch is against current master, i. e. 
ae49a7b29acc184b03c2a6bd6ac01b5e08efd54f > > -- > > diff --git a/misc/Versions b/misc/Versions > index d5b348e8..ad37a4c2 100644 > --- a/misc/Versions > +++ b/misc/Versions > @@ -71,7 +71,7 @@ libc { > # s* > sbrk; select; setdomainname; setfsent; sethostent; sethostid; sethostname; > setlogmask; setmntent; setregid; setreuid; setttyent; setusershell; sstk; > - stty; sync; syscall; syslog; > + stty; sync; syscall; syslog; syscall_no_errno; > > # t* > tdelete; tfind; truncate; tsearch; ttyslot; twalk; > diff --git a/posix/unistd.h b/posix/unistd.h > index 54d7d752..2f0f6e79 100644 > --- a/posix/unistd.h > +++ b/posix/unistd.h > @@ -1089,6 +1089,7 @@ extern void *sbrk (intptr_t __delta) __THROW; > In Mach, all system calls take normal arguments and always return an > error code (zero for success). */ > extern long int syscall (long int __sysno, ...) __THROW; > +extern long int syscall_no_errno (long int __sysno, ...) __THROW; > > #endif /* Use misc. */ > > diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list > index aac065e7..dac4d78e 100644 > --- a/sysdeps/unix/sysv/linux/syscall-names.list > +++ b/sysdeps/unix/sysv/linux/syscall-names.list > @@ -612,6 +612,7 @@ sys_epoll_create > sys_epoll_ctl > sys_epoll_wait > syscall > +syscall_no_errno > sysfs > sysinfo > syslog > diff --git a/sysdeps/unix/sysv/linux/syscall.c b/sysdeps/unix/sysv/linux/syscall.c > index 3cff1d97..481b18a4 100644 > --- a/sysdeps/unix/sysv/linux/syscall.c > +++ b/sysdeps/unix/sysv/linux/syscall.c > @@ -41,3 +41,20 @@ syscall (long int number, ...) > } > return r; > } > +long int > +syscall_no_errno (long int number, ...) 
> +{ > + va_list args; > + > + va_start (args, number); > + long int a0 = va_arg (args, long int); > + long int a1 = va_arg (args, long int); > + long int a2 = va_arg (args, long int); > + long int a3 = va_arg (args, long int); > + long int a4 = va_arg (args, long int); > + long int a5 = va_arg (args, long int); > + va_end (args); > + > + long int r = INTERNAL_SYSCALL_NCS_CALL (number, a0, a1, a2, a3, a4, a5); > + return r; > +} > diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist > index aea7848e..55d9dadd 100644 > --- a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist > +++ b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist > @@ -1761,6 +1761,7 @@ GLIBC_2.2.5 sys_nerr D 0x4 > GLIBC_2.2.5 sys_sigabbrev D 0x200 > GLIBC_2.2.5 sys_siglist D 0x200 > GLIBC_2.2.5 syscall F > +GLIBC_2.2.5 syscall_no_errno F > GLIBC_2.2.5 sysconf F > GLIBC_2.2.5 sysctl F > GLIBC_2.2.5 sysinfo F > diff --git a/sysdeps/unix/sysv/linux/x86_64/syscall.S b/sysdeps/unix/sysv/linux/x86_64/syscall.S > index 43af8087..04483251 100644 > --- a/sysdeps/unix/sysv/linux/x86_64/syscall.S > +++ b/sysdeps/unix/sysv/linux/x86_64/syscall.S > @@ -26,6 +26,18 @@ > > > .text > +ENTRY (syscall_no_errno) > + movq %rdi, %rax /* Syscall number -> rax. */ > + movq %rsi, %rdi /* shift arg1 - arg5. */ > + movq %rdx, %rsi > + movq %rcx, %rdx > + movq %r8, %r10 > + movq %r9, %r8 > + movq 8(%rsp),%r9 /* arg6 is on the stack. */ > + syscall /* Do the system call. */ > + ret /* Return to caller. */ > + > +PSEUDO_END_NOERRNO (syscall_no_errno) > ENTRY (syscall) > movq %rdi, %rax /* Syscall number -> rax. */ > movq %rsi, %rdi /* shift arg1 - arg5. */
On Thu, Feb 01, 2024 at 02:53:44PM -0300, Adhemerval Zanella Netto wrote: > > > On 28/01/24 13:39, Askar Safin wrote: > > Hi! I want glibc to have function "syscall_no_errno" on Linux. > > It should do the same "syscall" does, but it should not interpret > > return value and set errno. This is useful for calling syscalls > > such as getuid. I. e. now the user can call directly all syscalls > > including getuid and similar. > > > > I add example patch. It is quick-and-dirty. I was unable to figure out > > how to add function to headers. So, please, don't apply it as-is. > > > > I just want to know do you agree with my proposal. If yes, I will try > > to write better patch. > > > > I will repeat: currently glibc is simply incomplete, because it > > does not provide a way to call directly syscalls, such as getuid. > > So the user have to craft assembly, which is very difficult. > > Indeed there some old syscalls where trying to issue them directly with > syscall is problematic (like 'time' and 'brk' for some ABIs), but getuid > is not one of them. Also, recent Linux kABI is trying to avoid such > problematic interfaces to return the value as the return code and make > the invalid value similar to all cases. So these are not very compeling > reason to add a non-standard symbol to issue syscalls. > > CCing Rich, maybe he has a different view about this. That would be my view too. In general, it's rarely useful to make a raw syscall() to begin with. For the majority of syscalls that are ingredients in implementing standard or almost-standard functions provided by libc, going around libc to call it directly risks creating inconsistent state (brk), having inconsistent types (stat, etc.), missing fallback cases (direct use of time64 syscalls), etc. and really should not be encouraged by making unneeded new ways to do it. Indeed the only syscalls I'm aware of for which there's ambiguity about the result are of this sort. 
I don't even think brk is an affected one; values >=4095UL are not valid brk boundaries. Similarly, I don't think Linux supports setting system clock to before the epoch (and doing so would not be accurate anyway), so time is arguably not affected either. Even if it were, it's superseded by clock_gettime or gettimeofday, which don't have the problem, and might not even be available on future archs. There is also fcntl with F_GETOWN, which was not mentioned above, but it is superseded by F_GETOWN_EX, which the libc fcntl function automatically uses when available instead of using F_GETOWN directly. So I don't think it has modern relevance either. In any case, back to my original point, the use case for syscall() should be seen as calling newly-added extensions that don't yet have, or that aren't suitable to have, libc functions for them. Use of it as a means to "poke behind libc" should not be encouraged, as that tends to break things. If someone really wants to write code that's independent of libc -- like to run in a vforked child or CLONE_VM context or something without a valid thread pointer/TLS state/etc. -- it can't be using libc at all, and that includes the libc syscall() function (which not only may write errno, but might inspect TLS to determine how to make the syscall). For this level of "raw" syscall access, you need to write your own asm. Rich
Hi, Rich and Adhemerval! ---- On Thu, 01 Feb 2024 21:53:44 +0400 Adhemerval Zanella Netto wrote --- > Indeed there some old syscalls where trying to issue them directly with > syscall is problematic (like 'time' and 'brk' for some ABIs), but getuid > is not one of them. It *is* one of them! Keep in mind that Linux supports 32-bit uids. Run this code as root as 32-bit i386 binary (my letter continues after code). It is okay to run it on 64-bit amd64 kernel, you just have to make sure the binary itself is compiled as i386 =*=*=*=*= #if !defined(__i386__) #error #endif #include <stdint.h> #include <stdio.h> #include <sys/syscall.h> #include <unistd.h> int main (void) { // 4294967286 is (2^32)-10 uint32_t a = 4294967286U; if (syscall (SYS_setuid32, a) == -1) { perror("setuid"); return 1; } uint32_t b = syscall (SYS_getuid32); // Now b is equal to (uint32_t)-1 instead of wanted 4294967286 (i. e. (uint32_t)-10) printf("%u (wanted)\n", a); printf("%u (got)\n", b); return 0; } =*=*=*=*= (Also, when I said "getuid", I meant "SYS_getuid32".) I see this output: =*=*=*=*= 4294967286 (wanted) 4294967295 (got) =*=*=*=*= So, yes, function "syscall" is incompatible with SYS_getuid32. I'm nearly sure the same is true about getpid. Rich: > If someone really wants to write code that's > independent of libc -- like to run in a vforked child or CLONE_VM This is another use case I want to support. I. e. I want to have portable function (i. e. independent of arch) for issuing syscall in such case. And I don't want to write assembly. > but might inspect TLS to > determine how to make the syscall Okay, so syscall_no_errno should not do this. I. e. it should always do some way to make syscall, which always works, even if it is slow. For i386 it is "int 0x80" as well as I understand. Also, even if we read TLS to determine how to make syscall, what will go wrong? Child shares TLS with its parent after CLONE_VM, so we simply will read parent's TLS
On Thu, Feb 01, 2024 at 11:32:41PM +0400, Askar Safin wrote: > Hi, Rich and Adhemerval! > > ---- On Thu, 01 Feb 2024 21:53:44 +0400 Adhemerval Zanella Netto wrote --- > > Indeed there some old syscalls where trying to issue them directly with > > syscall is problematic (like 'time' and 'brk' for some ABIs), but getuid > > is not one of them. > It *is* one of them! > > Keep in mind that Linux supports 32-bit uids. > > Run this code as root as 32-bit i386 binary (my letter continues > after code). It is okay to run it on 64-bit amd64 kernel, you just > have to make sure the binary itself is compiled as i386 > =*=*=*=*= > #if !defined(__i386__) > #error > #endif > > #include <stdint.h> > #include <stdio.h> > #include <sys/syscall.h> > #include <unistd.h> > > int > main (void) > { > // 4294967286 is (2^32)-10 > uint32_t a = 4294967286U; > if (syscall (SYS_setuid32, a) == -1) > { > perror("setuid"); > return 1; > } > > uint32_t b = syscall (SYS_getuid32); > > // Now b is equal to (uint32_t)-1 instead of wanted 4294967286 (i. e. (uint32_t)-10) > printf("%u (wanted)\n", a); > printf("%u (got)\n", b); > return 0; > } > =*=*=*=*= > > (Also, when I said "getuid", I meant "SYS_getuid32".) > > I see this output: > =*=*=*=*= > 4294967286 (wanted) > 4294967295 (got) > =*=*=*=*= > > So, yes, function "syscall" is incompatible with SYS_getuid32. OK, but this falls under "poking around behind libc's back". If you call getuid() you'll find you get the right answer (at least on musl; not sure about glibc). > I'm nearly sure the same is true about getpid. It's not. PIDs are a 29- or 30-bit space (which is a matter of a bug I currently have open) on Linux. At least the high two bits are banned from ever being usable as a consequence of the futex interface. > Rich: > > If someone really wants to write code that's > > independent of libc -- like to run in a vforked child or CLONE_VM > > This is another use case I want to support. I. e. I want to > have portable function (i. e. 
independent of arch) for issuing > syscall in such case. And I don't > want to write assembly. Well that's not something syscall() gives you and it's not something syscall_no_errno() would reliably give you either unless it was specifically documented to be callable from such a context. If that's what you want, maybe that's what should be discussed. > > but might inspect TLS to > > determine how to make the syscall > > Okay, so syscall_no_errno should not do this. I. e. it should > always do some way to make syscall, which always works, > even if it is slow. For i386 it is "int 0x80" as well as I understand. > > Also, even if we read TLS to determine how to make syscall, > what will go wrong? Child shares TLS with its parent after CLONE_VM, > so we simply will read parent's TLS At least on glibc, you can pass clone flags that setup a new thread pointer in the child, which you may be intending to use for your own purposes in 'bare, no libc' code in the child. On musl this is not possible; the flags are rejected and you have to write your own asm to call clone if you want that. In any case, "what will go wrong?" is not the right question. The question is whether there's a contract for the thing you want to do to work, and there isn't. Rich
On 01/02/24 17:16, dalias wrote: > On Thu, Feb 01, 2024 at 11:32:41PM +0400, Askar Safin wrote: >> Hi, Rich and Adhemerval! >> >> ---- On Thu, 01 Feb 2024 21:53:44 +0400 Adhemerval Zanella Netto wrote --- >> > Indeed there some old syscalls where trying to issue them directly with >> > syscall is problematic (like 'time' and 'brk' for some ABIs), but getuid >> > is not one of them. >> It *is* one of them! >> >> Keep in mind that Linux supports 32-bit uids. >> >> Run this code as root as 32-bit i386 binary (my letter continues >> after code). It is okay to run it on 64-bit amd64 kernel, you just >> have to make sure the binary itself is compiled as i386 >> =*=*=*=*= >> #if !defined(__i386__) >> #error >> #endif >> >> #include <stdint.h> >> #include <stdio.h> >> #include <sys/syscall.h> >> #include <unistd.h> >> >> int >> main (void) >> { >> // 4294967286 is (2^32)-10 >> uint32_t a = 4294967286U; >> if (syscall (SYS_setuid32, a) == -1) >> { >> perror("setuid"); >> return 1; >> } >> >> uint32_t b = syscall (SYS_getuid32); >> >> // Now b is equal to (uint32_t)-1 instead of wanted 4294967286 (i. e. (uint32_t)-10) >> printf("%u (wanted)\n", a); >> printf("%u (got)\n", b); >> return 0; >> } >> =*=*=*=*= >> >> (Also, when I said "getuid", I meant "SYS_getuid32".) >> >> I see this output: >> =*=*=*=*= >> 4294967286 (wanted) >> 4294967295 (got) >> =*=*=*=*= >> >> So, yes, function "syscall" is incompatible with SYS_getuid32. > > OK, but this falls under "poking around behind libc's back". If you > call getuid() you'll find you get the right answer (at least on musl; > not sure about glibc). It works as expected on glibc as well. > >> I'm nearly sure the same is true about getpid. > > It's not. PIDs are a 29- or 30-bit space (which is a matter of a bug I > currently have open) on Linux. At least the high two bits are banned > from ever being usable as a consequence of the futex interface. 
> >> Rich: >>> If someone really wants to write code that's >>> independent of libc -- like to run in a vforked child or CLONE_VM >> >> This is another use case I want to support. I. e. I want to >> have portable function (i. e. independent of arch) for issuing >> syscall in such case. And I don't >> want to write assembly. > > Well that's not something syscall() gives you and it's not something > syscall_no_errno() would reliably give you either unless it was > specifically documented to be callable from such a context. If that's > what you want, maybe that's what should be discussed. There is another potential issue where you have multiple syscalls with slightly different kABIs (like the ones that have 64 bit arguments on 32 bit architectures) and, at least for glibc, where the user exported argument does not really match the kernel (stat/fstat/etc.). > >>> but might inspect TLS to >>> determine how to make the syscall >> >> Okay, so syscall_no_errno should not do this. I. e. it should >> always do some way to make syscall, which always works, >> even if it is slow. For i386 it is "int 0x80" as well as I understand. >> >> Also, even if we read TLS to determine how to make syscall, >> what will go wrong? Child shares TLS with its parent after CLONE_VM, >> so we simply will read parent's TLS > > At least on glibc, you can pass clone flags that setup a new thread > pointer in the child, which you may be intending to use for your own > purposes in 'bare, no libc' code in the child. On musl this is not > possible; the flags are rejected and you have to write your own asm to > call clone if you want that. > > In any case, "what will go wrong?" is not the right question. The > question is whether there's a contract for the thing you want to do to > work, and there isn't. And I am not very fond of this possibly slightly different contract for a non-standard interface.
Hi, Rich and Adhemerval! ---- On Fri, 02 Feb 2024 00:16:26 +0400 dalias wrote --- > OK, but this falls under "poking around behind libc's back". Function "syscall" exists in glibc. So it is assumed to be useful. Thus it is okay to have "syscall_no_errno", too. "syscall" in its current form has a problem: you cannot correctly call SYS_getuid32. Thus there is need for "syscall_no_errno". So I will write next version of my patch. I hope in several days, when I have time. "syscall_no_errno" will be simple wrapper for "INTERNAL_SYSCALL_NCS_CALL". > Well that's not something syscall() gives you and it's not something > syscall_no_errno() would reliably give you either unless it was > specifically documented to be callable from such a context. If that's > what you want, maybe that's what should be discussed. I did some research. I have read all places where glibc calls "clone". And I see that in all these places if CLONE_VM specified, then CLONE_SETTLS or CLONE_VFORK is specified. In fact, there is 2 places only, where glibc calls clone (in Linux x86_64). One is implementation of pthread_create. We pass CLONE_VM and CLONE_SETTLS. Thus (as well as I understand) TLS is set up in child right from the beginning. There is no time gap, when TLS is not initialized yet. Second place is implementation of posix_spawn. We pass CLONE_VM and CLONE_VFORK. CLONE_VFORK means that parent is suspended, and thus it is okay to access its TLS. Thus glibc absolutely always (even internally!) maintains invariant "TLS is safe to access". I. e. glibc itself does not need a way to perform syscalls when TLS is not set up. Thus if I want syscall_no_errno to provide such guarantee, this will be absolutely unique guarantee, which is not needed even by glibc authors themselves. So I think this will be very hard to provide it, and benefits will be small. Yes, small time gap when TLS is not initialized, still exists. Before "main". But in this case we have one thread only, so everything is simple. 
For all these reasons I think that guarantee "you may call syscall_no_errno even after clone(CLONE_VM) without CLONE_VFORK and without CLONE_SETTLS" should not be provided. (Correct me if I'm wrong.) So I go back to my original proposal: add function, which will be able to call directly syscalls, such as SYS_getuid32. Adhemerval Zanella Netto: > There is another potential issue where you have multiple syscall > with slight different kABI In fact my original use case is this: I want to parse true kernel ABI from files /sys/kernel/debug/tracing/events/syscalls/sys_enter_*/format and then generate a library based on the ABI. Such library will provide syscall wrappers with correct types. These wrappers will call functions "syscall" and "syscall_no_errno". (In fact, this will be Rust library, not C one, but I don't think this is important for our discussion. You may say: "Then why do you want to add syscall_no_errno to upstream glibc?" Well, because I want it to be available to everyone. To Rust programmers, to C programmers and to everyone else. Without the need to write assembly.) So, I will read proper kernel ABI from /sys/kernel/..., so I will get ABI right. > at least for glibc, where the user > exported argument does not really match the kernel (stat/fstat/etc.). I will use Linux headers, so all will be okay. (Well, correction: I will use Rust's crate "linux-raw-sys", which is based on Linux headers.) You may say: "Well, I still think you should find Rust library, which does what you want, and stop bothering upstream glibc". I will answer: I don't trust Rust libs! Yes, there exist multiple Rust crates, which provide something similar to syscall_no_errno. They are implemented using hand-coded assembly. Why should I trust that they got this assembly correct? On the other hand, glibc already has high quality INTERNAL_SYSCALL_NCS_CALL, which I trust. It is more likely to be correct, than all these buggy Rust crates. We just need to export it. 
So I propose to export simple wrapper for INTERNAL_SYSCALL_NCS_CALL called "syscall_no_errno". And everybody will use it. C devs, Rust devs, etc. Big benefits for everybody. For low cost on glibc side. Also, I have a question. It seems that function "syscall" uses TLS and nothing else. This means that it is okay to call it in a child, created using "clone", assuming that one of these holds: - We didn't specify CLONE_VM - We specified CLONE_VM and CLONE_VFORK (thus parent is suspended and it is okay to use its TLS) So may I also write a patch for documentation, which will document this guarantee? It surely already holds, because glibc uses it in implementation of posix_spawn. I just want to document it, and this will mean that now it will hold in the future. Moreover, such guarantee holds if we call clone using this code without specifiying new stack: if (syscall(SYS_clone, ...) == 0){ /* we are in child */ }
sounds like you want this project: https://chromium.googlesource.com/linux-syscall-support/ -mike
---- On Wed, 07 Feb 2024 05:57:49 +0400 Mike Frysinger wrote --- > sounds like you want this project: > https://chromium.googlesource.com/linux-syscall-support/ This project has exactly the problem I'm talking about! Exactly the problem I'm trying to fix using syscall_no_errno! linux-syscall-support always unconditionally checks syscall result and sets errno based on it: https://chromium.googlesource.com/linux-syscall-support/+/ed31caa60f20a4f6569883b2d752ef7522de51e0/linux_syscall_support.h#2175 You can verify that this happens in all code paths by carefully reading the source. Fortunately, the project doesn't provide "getuid" at all. I think this is because the authors are aware of the problem. Also, the project is simply buggy. Look at this (my!) commit: https://chromium.googlesource.com/linux-syscall-support/+/e1e7b0ad8ee99a875b272c8e33e308472e897660 If I didn't notice this bug and didn't fix it, the bug would stay forever! So, after my experience with this project I conclude: I don't trust such projects. I. e. small project claiming to support performing direct syscalls. I don't trust C libraries for this purpose and I don't trust Rust libraries for this purpose. The only libraries I trust are glibc and musl. But both don't export anything like syscall_no_errno. And this is what I'm trying to change. I'm aware of the existence of the "nolibc" project inside Linux source tree. But (based on my experience with linux-syscall-support) I don't trust it, either. But... well, I *will* trust some of these projects if I carefully read sources. So, yes, I can fix problem for me personally: just carefully read sources of linux-syscall-support and use it! Or some similar C or Rust library. Moreover: my task is even simpler! The only architectures I want to support in my project are x86_64 and i386. So I can simply copy needed assembly from glibc sources and all will be done! But I'm not trying to fix the problem for me personally. I want to fix it for everyone! 
I want anyone to be able just to use universally available trusted function from glibc. Without the need for assembly or third party projects such as linux-syscall-support. glibc already provides "syscall", thus it should provide "syscall_no_errno", too! Askar Safin https://types.pl/@safinaskar
On Wed, Feb 07, 2024 at 04:59:54AM +0400, Askar Safin wrote: > Hi, Rich and Adhemerval! > > ---- On Fri, 02 Feb 2024 00:16:26 +0400 dalias wrote --- > > OK, but this falls under "poking around behind libc's back". > > Function "syscall" exists in glibc. So it is assumed to be useful. > Thus it is okay to have "syscall_no_errno", too. "syscall" in > its current form has a problem: you cannot correctly call > SYS_getuid32. Thus there is need for "syscall_no_errno". > > So I will write next version of my patch. I hope in several days, > when I have time. > "syscall_no_errno" will be simple wrapper for "INTERNAL_SYSCALL_NCS_CALL". > > > Well that's not something syscall() gives you and it's not something > > syscall_no_errno() would reliably give you either unless it was > > specifically documented to be callable from such a context. If that's > > what you want, maybe that's what should be discussed. > > I did some research. I have read all places where glibc calls "clone". > And I see that in all these places if CLONE_VM specified, then > CLONE_SETTLS or CLONE_VFORK is specified. > > In fact, there is 2 places only, where glibc calls clone (in Linux x86_64). > > One is implementation of pthread_create. We pass CLONE_VM and > CLONE_SETTLS. Thus (as well as I understand) TLS is set up > in child right from the beginning. > There is no time gap, when TLS is not initialized yet. > > Second place is implementation of posix_spawn. We pass CLONE_VM > and CLONE_VFORK. CLONE_VFORK means that parent is suspended, > and thus it is okay to access its TLS. > > Thus glibc absolutely always (even internally!) maintains invariant > "TLS is safe to access". I. e. glibc itself does not need a way to perform > syscalls when TLS is not set up. You're missing the point. Of course glibc maintains that invariant because it's providing a consistent platform to applications. 
If you have application code that uses clone() itself, it may not be meeting those invariants necessary to be able to call into libc. > In fact my original use case is so: I want to parse true kernel ABI > from files /sys/kernel/debug/tracing/events/syscalls/sys_enter_*/format > and then generate a library based on the ABI. > > Such library will provide syscall wrappers with correct types. These > wrappers will call functions "syscall" and "syscall_no_errno". > > (In fact, this will be Rust library, not C one, but I don't think > this is important for our discussion. You may say: "Then why > you want to add syscall_no_errno to upstream glibc?" > Well, because I want it to be available to everyone. To Rust > programmers, to C programmers and to everyone else. > Without the need to write assembly.) > > So, I will read proper kernel ABI from /sys/kernel/..., so > I will get ABI right. Can you clarify *why* you want to do that? Making raw syscalls like this is ill-advised and *will* run into mismatched ABI for certain types (where the kernel and user types differ), poor compatibility with kernel versions that aren't the same you expected, time64 issues on 32-bit archs, etc. Based on our conversations in this thread, I do not think you're prepared to handle all that right. It most certainly cannot be handled in an automated way by parsing the data you have in mind; doing it right requires knowing semantic relationships between syscalls. It sounds like you plan to hand Rust programmers a big footgun. Rich
---- On Thu, 08 Feb 2024 00:59:02 +0400 dalias wrote --- > You're missing the point. Of course glibc maintains that invariant > because it's providing a consistent platform to applications. If you > have application code that uses clone() itself, it may not be meeting > those invariants necessary to be able to call into libc. Let me try to explain in another way. I tried to understand what guarantees of "syscall" and "syscall_no_errno" functions will be easy for glibc maintainers to provide, and what will be hard. Let's begin with guarantee "it is okay to call 'syscall_no_errno', when TLS is possibly not initialized properly, for example in child created by clone(CLONE_VM)". Such guarantee will be hard to provide, because it seems that it is not provided currently. I. e. additional work will be needed to provide it. It seems to be not provided, because glibc nearly never uses it. glibc seems to never use it, because in glibc TLS is nearly always initialized. Now let's consider another possible guarantee "it is okay to call 'syscall' and 'syscall_no_errno' in child created by clone(CLONE_VM | CLONE_VFORK)". It will be easy to provide, because (as of current master) it is already provided, but not documented. I see that it is provided, because I see that glibc internally uses this guarantee. glibc uses it in spawn implementation. glibc creates child in posix_spawn using CLONE_VM | CLONE_VFORK. And then issues syscalls using something very close to "syscall". So it is very easy to provide such guarantee. We just need to document it. If it is broken, glibc developers will notice this, because they depend on this guarantee in spawn implementation. So I don't propose to add that guarantee about TLS. But I do propose to add (i. e. to document) guarantee about CLONE_VM | CLONE_VFORK. > Can you clarify *why* you want to do that? As I already said, I want to write that Rust library. I don't want to publish it. Just for personal use. I need it to experiment with Linux. I. e. 
not for any serious use. Just a playground. I want to have full control over what syscalls I issue. And I want to send syscall_no_errno to upstream glibc (and ideally to upstream musl, too), because other developers may have similar needs. I. e. because they sometimes want to have full control over syscalls they issue. Currently, if someone wants to issue syscall, such as SYS_getuid32 on i386, they must either use assembly (which is hard to do right), or use some third-party library. But I don't trust third-party libraries, as I already said in this thread. So I suggest other people not to use them. For example, there exists https://chromium.googlesource.com/linux-syscall-support/ , which is intended to be a way to make raw syscalls. But as I said previously, it doesn't provide a way to call getuid, and also it had a bug, which was not fixed until *I* noticed it. So I propose to export battle-tested trusted glibc implementation as a public symbol. And everyone will be able to use it. > Making raw syscalls like > this is ill-advised and *will* run into mismatched ABI for certain > types (where the kernel and user types differ) It will not, because I will use Rust crate "linux-raw-sys", which is based on Linux uapi headers, i. e. headers from dir "include/uapi" from Linux source tree. > poor compatibility > with kernel versions that aren't the same you expected What do you mean? Newer kernel versions are ABI compatible with older ones. And yes, if you write program for newer kernel and try to run it on older, you may get some issues. But the same applies to usual programs written with glibc and musl. Yes, glibc and musl know how to fall back to older syscalls. So, what? I can manually write such fall-back code if needed. Again: I'm aware of all these issues. Of course, I know that issuing syscalls directly is harder than using libc wrappers. > time64 issues > on 32-bit archs, etc. As I said, I will use prototypes parsed from /sys/... with types got from Linux uapi headers. 
This will remove all time64 and similar problems. Yes, this will mean that I will have to manually write fall-back code, i. e. "try time64, if it fails, then do time32". But I'm aware of this. > It most certainly cannot be handled > in an automated way by parsing the data you have in mind As I said, it *can* be done in an automated way. Just use prototypes from /sys/... and type definitions from uapi headers. Feel free to steal this idea and use it in musl. :) Also you may consider this project https://syscalls.mebeim.net/ instead of parsing /sys/... . They get prototypes from kernel ELF binary. And, yes, I know that when I see, say, "pid_t" in /sys/... , this really means __kernel_pid_t from uapi headers. (Also I plan to parse files from strace project, such as this: https://github.com/strace/strace/blob/eb6014c510d663938db331fef5ac4ef78fdcf583/src/linux/i386/syscallent.h . Because they mark functions, which cannot fail, as "NF". So I will know when I should check errors, and when not.) Of course, this process will produce not-so-usable functions. So I then will manually write wrappers for them. For example, this process may automatically produce this Rust prototype for "write": fn write(fd: c_int, buf: *const c_void, len: usize) -> Result<isize, Errno> And then I will further manually wrap it and will make something like this: fn write(fd: c_int, buf: &[u8]) -> Result<usize, Errno> And of course, all these functions will be marked as "unsafe". I have no plans for creating safe (in Rust sense) interface for Linux syscalls. I do not plan to publish code, but if you want, I can show it to you, when it is ready. > doing it > right requires knowing semantic relationships between syscalls. Knowing semantic relationship may be required if I want to add extra type data not present in /sys/... . For example if I want to have a separate type for "int", which means file descriptor, and for other "int"s. Yes, I know that this cannot be done in an automated way. 
Yes, I know that the generated code will simply have "int"s everywhere and I will have to write a manual wrapper if I want something more type-safe. Also, semantic relationships may be needed if I want to present a safe interface (in the Rust sense). I have no such plans. -- Askar Safin https://types.pl/@safinaskar
diff --git a/misc/Versions b/misc/Versions index d5b348e8..ad37a4c2 100644 --- a/misc/Versions +++ b/misc/Versions @@ -71,7 +71,7 @@ libc { # s* sbrk; select; setdomainname; setfsent; sethostent; sethostid; sethostname; setlogmask; setmntent; setregid; setreuid; setttyent; setusershell; sstk; - stty; sync; syscall; syslog; + stty; sync; syscall; syslog; syscall_no_errno; # t* tdelete; tfind; truncate; tsearch; ttyslot; twalk; diff --git a/posix/unistd.h b/posix/unistd.h index 54d7d752..2f0f6e79 100644 --- a/posix/unistd.h +++ b/posix/unistd.h @@ -1089,6 +1089,7 @@ extern void *sbrk (intptr_t __delta) __THROW; In Mach, all system calls take normal arguments and always return an error code (zero for success). */ extern long int syscall (long int __sysno, ...) __THROW; +extern long int syscall_no_errno (long int __sysno, ...) __THROW; #endif /* Use misc. */ diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list index aac065e7..dac4d78e 100644 --- a/sysdeps/unix/sysv/linux/syscall-names.list +++ b/sysdeps/unix/sysv/linux/syscall-names.list @@ -612,6 +612,7 @@ sys_epoll_create sys_epoll_ctl sys_epoll_wait syscall +syscall_no_errno sysfs sysinfo syslog diff --git a/sysdeps/unix/sysv/linux/syscall.c b/sysdeps/unix/sysv/linux/syscall.c index 3cff1d97..481b18a4 100644 --- a/sysdeps/unix/sysv/linux/syscall.c +++ b/sysdeps/unix/sysv/linux/syscall.c @@ -41,3 +41,20 @@ syscall (long int number, ...) } return r; } +long int +syscall_no_errno (long int number, ...) 
+{ + va_list args; + + va_start (args, number); + long int a0 = va_arg (args, long int); + long int a1 = va_arg (args, long int); + long int a2 = va_arg (args, long int); + long int a3 = va_arg (args, long int); + long int a4 = va_arg (args, long int); + long int a5 = va_arg (args, long int); + va_end (args); + + long int r = INTERNAL_SYSCALL_NCS_CALL (number, a0, a1, a2, a3, a4, a5); + return r; +} diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist index aea7848e..55d9dadd 100644 --- a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist +++ b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist @@ -1761,6 +1761,7 @@ GLIBC_2.2.5 sys_nerr D 0x4 GLIBC_2.2.5 sys_sigabbrev D 0x200 GLIBC_2.2.5 sys_siglist D 0x200 GLIBC_2.2.5 syscall F +GLIBC_2.2.5 syscall_no_errno F GLIBC_2.2.5 sysconf F GLIBC_2.2.5 sysctl F GLIBC_2.2.5 sysinfo F diff --git a/sysdeps/unix/sysv/linux/x86_64/syscall.S b/sysdeps/unix/sysv/linux/x86_64/syscall.S index 43af8087..04483251 100644 --- a/sysdeps/unix/sysv/linux/x86_64/syscall.S +++ b/sysdeps/unix/sysv/linux/x86_64/syscall.S @@ -26,6 +26,18 @@ .text +ENTRY (syscall_no_errno) + movq %rdi, %rax /* Syscall number -> rax. */ + movq %rsi, %rdi /* shift arg1 - arg5. */ + movq %rdx, %rsi + movq %rcx, %rdx + movq %r8, %r10 + movq %r9, %r8 + movq 8(%rsp),%r9 /* arg6 is on the stack. */ + syscall /* Do the system call. */ + ret /* Return to caller. */ + +PSEUDO_END_NOERRNO (syscall_no_errno) ENTRY (syscall) movq %rdi, %rax /* Syscall number -> rax. */ movq %rsi, %rdi /* shift arg1 - arg5. */