From patchwork Thu Mar 21 12:14:49 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Thibault Ferrante X-Patchwork-Id: 1914391 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=lists.ubuntu.com (client-ip=185.125.189.65; helo=lists.ubuntu.com; envelope-from=kernel-team-bounces@lists.ubuntu.com; receiver=patchwork.ozlabs.org) Received: from lists.ubuntu.com (lists.ubuntu.com [185.125.189.65]) (using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4V0kvN2VRrz1yWs for ; Thu, 21 Mar 2024 23:16:00 +1100 (AEDT) Received: from localhost ([127.0.0.1] helo=lists.ubuntu.com) by lists.ubuntu.com with esmtp (Exim 4.86_2) (envelope-from ) id 1rnHKd-0007wR-5W; Thu, 21 Mar 2024 12:15:39 +0000 Received: from smtp-relay-canonical-1.internal ([10.131.114.174] helo=smtp-relay-canonical-1.canonical.com) by lists.ubuntu.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.86_2) (envelope-from ) id 1rnHKB-0007t1-6P for kernel-team@lists.ubuntu.com; Thu, 21 Mar 2024 12:15:12 +0000 Received: from Q58-sff.buildd (2.general.thibf.uk.vpn [10.172.200.120]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by smtp-relay-canonical-1.canonical.com (Postfix) with ESMTPSA id BAF6540EB6; Thu, 21 Mar 2024 12:15:10 +0000 (UTC) From: Thibault Ferrante To: kernel-team@lists.ubuntu.com Subject: [SRU][J][PATCH 1/9] ipc: check checkpoint_restore_ns_capable() to modify C/R proc files Date: Thu, 21 Mar 2024 13:14:49 +0100 Message-ID: <20240321121457.362921-2-thibault.ferrante@canonical.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: 
<20240321121457.362921-1-thibault.ferrante@canonical.com> References: <20240321121457.362921-1-thibault.ferrante@canonical.com> MIME-Version: 1.0 X-BeenThere: kernel-team@lists.ubuntu.com X-Mailman-Version: 2.1.20 Precedence: list List-Id: Kernel team discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" From: Michal Clapinski BugLink: https://bugs.launchpad.net/bugs/2058485 This commit removes the requirement to be root to modify sem_next_id, msg_next_id and shm_next_id and checks checkpoint_restore_ns_capable instead. Since those files are specific to the IPC namespace, there is no reason they should require root privileges. This is similar to ns_last_pid, which also only checks checkpoint_restore_ns_capable. [akpm@linux-foundation.org: ipc/ipc_sysctl.c needs capability.h for checkpoint_restore_ns_capable()] Link: https://lkml.kernel.org/r/20210916163717.3179496-1-mclapinski@google.com Signed-off-by: Michal Clapinski Reviewed-by: Davidlohr Bueso Reviewed-by: Manfred Spraul Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 5563cabdde7ee53c34ec7e5e0283bfcc9a1bc893) Signed-off-by: Thibault Ferrante --- ipc/ipc_sysctl.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 3f312bf2b1163..345e4d673e61e 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "util.h" @@ -104,6 +105,19 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, return ret; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos) +{ + struct user_namespace *user_ns = current->nsproxy->ipc_ns->user_ns; + + if (write && !checkpoint_restore_ns_capable(user_ns)) + return -EPERM; + + return proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + #else #define proc_ipc_doulongvec_minmax NULL #define proc_ipc_dointvec NULL @@ -111,6 +125,9 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, #define proc_ipc_dointvec_minmax_orphans NULL #define proc_ipc_auto_msgmni NULL #define proc_ipc_sem_dointvec NULL +#ifdef CONFIG_CHECKPOINT_RESTORE +#define proc_ipc_dointvec_minmax_checkpoint_restore NULL +#endif /* CONFIG_CHECKPOINT_RESTORE */ #endif int ipc_mni = IPCMNI; @@ -198,8 +215,8 @@ static struct ctl_table ipc_kern_table[] = { .procname = "sem_next_id", .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), - .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .mode = 0666, + .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, @@ -207,8 +224,8 @@ static struct ctl_table ipc_kern_table[] = { .procname = "msg_next_id", .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, .maxlen = 
sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), - .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .mode = 0666, + .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, @@ -216,8 +233,8 @@ static struct ctl_table ipc_kern_table[] = { .procname = "shm_next_id", .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), - .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .mode = 0666, + .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, From patchwork Thu Mar 21 12:14:50 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Thibault Ferrante X-Patchwork-Id: 1914394 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=lists.ubuntu.com (client-ip=185.125.189.65; helo=lists.ubuntu.com; envelope-from=kernel-team-bounces@lists.ubuntu.com; receiver=patchwork.ozlabs.org) Received: from lists.ubuntu.com (lists.ubuntu.com [185.125.189.65]) (using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4V0kx40XWqz1yWs for ; Thu, 21 Mar 2024 23:17:28 +1100 (AEDT) Received: from localhost ([127.0.0.1] helo=lists.ubuntu.com) by lists.ubuntu.com with esmtp (Exim 4.86_2) (envelope-from ) id 1rnHMG-0008RY-W8; Thu, 21 Mar 2024 12:17:21 +0000 Received: from smtp-relay-canonical-1.internal ([10.131.114.174] helo=smtp-relay-canonical-1.canonical.com) by lists.ubuntu.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.86_2) (envelope-from ) id 1rnHKC-0007tB-5r for kernel-team@lists.ubuntu.com; Thu, 21 Mar 2024 12:15:12 +0000 Received: from Q58-sff.buildd (2.general.thibf.uk.vpn 
[10.172.200.120]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by smtp-relay-canonical-1.canonical.com (Postfix) with ESMTPSA id C1004445A4; Thu, 21 Mar 2024 12:15:11 +0000 (UTC) From: Thibault Ferrante To: kernel-team@lists.ubuntu.com Subject: [SRU][J][PATCH 2/9] ipc/ipc_sysctl.c: remove fallback for !CONFIG_PROC_SYSCTL Date: Thu, 21 Mar 2024 13:14:50 +0100 Message-ID: <20240321121457.362921-3-thibault.ferrante@canonical.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240321121457.362921-1-thibault.ferrante@canonical.com> References: <20240321121457.362921-1-thibault.ferrante@canonical.com> MIME-Version: 1.0 X-BeenThere: kernel-team@lists.ubuntu.com X-Mailman-Version: 2.1.20 Precedence: list List-Id: Kernel team discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" From: Manfred Spraul BugLink: https://bugs.launchpad.net/bugs/2058485 Compilation of ipc/ipc_sysctl.c is controlled by obj-$(CONFIG_SYSVIPC_SYSCTL) [see ipc/Makefile] And CONFIG_SYSVIPC_SYSCTL depends on SYSCTL [see init/Kconfig] And SYSCTL is selected by PROC_SYSCTL. [see fs/proc/Kconfig] Thus: #ifndef CONFIG_PROC_SYSCTL in ipc/ipc_sysctl.c is impossible, the fallback can be removed. Link: https://lkml.kernel.org/r/20210918145337.3369-1-manfred@colorfullife.com Signed-off-by: Manfred Spraul Reviewed-by: "Eric W. 
Biederman" Acked-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 0e9beb8a96f21a6df1579cb3a679e150e3269d80) Signed-off-by: Thibault Ferrante --- ipc/ipc_sysctl.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 345e4d673e61e..f101c171753f6 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -23,7 +23,6 @@ static void *get_ipc(struct ctl_table *table) return which; } -#ifdef CONFIG_PROC_SYSCTL static int proc_ipc_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -118,18 +117,6 @@ static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table, } #endif -#else -#define proc_ipc_doulongvec_minmax NULL -#define proc_ipc_dointvec NULL -#define proc_ipc_dointvec_minmax NULL -#define proc_ipc_dointvec_minmax_orphans NULL -#define proc_ipc_auto_msgmni NULL -#define proc_ipc_sem_dointvec NULL -#ifdef CONFIG_CHECKPOINT_RESTORE -#define proc_ipc_dointvec_minmax_checkpoint_restore NULL -#endif /* CONFIG_CHECKPOINT_RESTORE */ -#endif - int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; From patchwork Thu Mar 21 12:14:51 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Thibault Ferrante X-Patchwork-Id: 1914395 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=lists.ubuntu.com (client-ip=185.125.189.65; helo=lists.ubuntu.com; envelope-from=kernel-team-bounces@lists.ubuntu.com; receiver=patchwork.ozlabs.org) Received: from lists.ubuntu.com (lists.ubuntu.com [185.125.189.65]) (using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 
4V0kxB4WDKz1yWs for ; Thu, 21 Mar 2024 23:17:34 +1100 (AEDT) Received: from localhost ([127.0.0.1] helo=lists.ubuntu.com) by lists.ubuntu.com with esmtp (Exim 4.86_2) (envelope-from ) id 1rnHMJ-0008VU-PS; Thu, 21 Mar 2024 12:17:24 +0000 Received: from smtp-relay-canonical-1.internal ([10.131.114.174] helo=smtp-relay-canonical-1.canonical.com) by lists.ubuntu.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.86_2) (envelope-from ) id 1rnHKD-0007tE-1g for kernel-team@lists.ubuntu.com; Thu, 21 Mar 2024 12:15:13 +0000 Received: from Q58-sff.buildd (2.general.thibf.uk.vpn [10.172.200.120]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by smtp-relay-canonical-1.canonical.com (Postfix) with ESMTPSA id AA4EB445A6; Thu, 21 Mar 2024 12:15:12 +0000 (UTC) From: Thibault Ferrante To: kernel-team@lists.ubuntu.com Subject: [SRU][J][PATCH 3/9] ipc: Store mqueue sysctls in the ipc namespace Date: Thu, 21 Mar 2024 13:14:51 +0100 Message-ID: <20240321121457.362921-4-thibault.ferrante@canonical.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240321121457.362921-1-thibault.ferrante@canonical.com> References: <20240321121457.362921-1-thibault.ferrante@canonical.com> MIME-Version: 1.0 X-BeenThere: kernel-team@lists.ubuntu.com X-Mailman-Version: 2.1.20 Precedence: list List-Id: Kernel team discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" From: Alexey Gladkov BugLink: https://bugs.launchpad.net/bugs/2058485 Right now, the mqueue sysctls take ipc namespaces into account in a rather hacky way. This works in most cases, but does not respect the user namespace. Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/* parameters. This poses a problem in the rootless containers. 
To solve this I changed the implementation of the mqueue sysctls just like some other sysctls. So far, the changes do not provide additional access to files. This will be done in a future patch. v3: * Don't implement set_permissions to keep the current behavior. v2: * Fixed compilation problem if CONFIG_POSIX_MQUEUE_SYSCTL is not specified. Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/b0ccbb2489119f1f20c737cf1930c3a9c4e4243a.1644862280.git.legion@kernel.org Signed-off-by: Eric W. Biederman (cherry picked from commit dc55e35f9e810f23dd69cfdc91a3d636023f57a2) Signed-off-by: Thibault Ferrante --- include/linux/ipc_namespace.h | 16 +++-- ipc/mq_sysctl.c | 121 ++++++++++++++++++---------------- ipc/mqueue.c | 10 ++- ipc/namespace.c | 6 ++ 4 files changed, 88 insertions(+), 65 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index acd2398317c28..9cb2ae3415e05 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -10,6 +10,7 @@ #include #include #include +#include struct user_namespace; @@ -63,6 +64,9 @@ struct ipc_namespace { unsigned int mq_msg_default; unsigned int mq_msgsize_default; + struct ctl_table_set mq_set; + struct ctl_table_header *mq_sysctls; + /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; struct ucounts *ucounts; @@ -172,14 +176,18 @@ static inline void put_ipc_ns(struct ipc_namespace *ns) #ifdef CONFIG_POSIX_MQUEUE_SYSCTL -struct ctl_table_header; -extern struct ctl_table_header *mq_register_sysctl_table(void); +void retire_mq_sysctls(struct ipc_namespace *ns); +bool setup_mq_sysctls(struct ipc_namespace *ns); #else /* CONFIG_POSIX_MQUEUE_SYSCTL */ -static inline struct ctl_table_header *mq_register_sysctl_table(void) +static inline void retire_mq_sysctls(struct ipc_namespace *ns) { - return NULL; +} + +static inline bool setup_mq_sysctls(struct ipc_namespace *ns) +{ + return true; } #endif /* CONFIG_POSIX_MQUEUE_SYSCTL 
*/ diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 72a92a08c848e..fbf6a8b93a265 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -9,39 +9,9 @@ #include #include -#ifdef CONFIG_PROC_SYSCTL -static void *get_mq(struct ctl_table *table) -{ - char *which = table->data; - struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; - which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; - return which; -} - -static int proc_mq_dointvec(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table mq_table; - memcpy(&mq_table, table, sizeof(mq_table)); - mq_table.data = get_mq(table); - - return proc_dointvec(&mq_table, write, buffer, lenp, ppos); -} - -static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table mq_table; - memcpy(&mq_table, table, sizeof(mq_table)); - mq_table.data = get_mq(table); - - return proc_dointvec_minmax(&mq_table, write, buffer, - lenp, ppos); -} -#else -#define proc_mq_dointvec NULL -#define proc_mq_dointvec_minmax NULL -#endif +#include +#include +#include static int msg_max_limit_min = MIN_MSGMAX; static int msg_max_limit_max = HARD_MSGMAX; @@ -55,14 +25,14 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_queues_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec, + .proc_handler = proc_dointvec, }, { .procname = "msg_max", .data = &init_ipc_ns.mq_msg_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, @@ -71,7 +41,7 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_msgsize_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, @@ -80,7 +50,7 @@ static struct ctl_table mq_sysctls[] = 
{ .data = &init_ipc_ns.mq_msg_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, @@ -89,32 +59,73 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_msgsize_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, {} }; -static struct ctl_table mq_sysctl_dir[] = { - { - .procname = "mqueue", - .mode = 0555, - .child = mq_sysctls, - }, - {} -}; +static struct ctl_table_set *set_lookup(struct ctl_table_root *root) +{ + return &current->nsproxy->ipc_ns->mq_set; +} -static struct ctl_table mq_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = mq_sysctl_dir, - }, - {} +static int set_is_seen(struct ctl_table_set *set) +{ + return &current->nsproxy->ipc_ns->mq_set == set; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, }; -struct ctl_table_header *mq_register_sysctl_table(void) +bool setup_mq_sysctls(struct ipc_namespace *ns) { - return register_sysctl_table(mq_sysctl_root); + struct ctl_table *tbl; + + setup_sysctl_set(&ns->mq_set, &set_root, set_is_seen); + + tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL); + if (tbl) { + int i; + + for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) { + if (tbl[i].data == &init_ipc_ns.mq_queues_max) + tbl[i].data = &ns->mq_queues_max; + + else if (tbl[i].data == &init_ipc_ns.mq_msg_max) + tbl[i].data = &ns->mq_msg_max; + + else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max) + tbl[i].data = &ns->mq_msgsize_max; + + else if (tbl[i].data == &init_ipc_ns.mq_msg_default) + tbl[i].data = &ns->mq_msg_default; + + else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default) + tbl[i].data = &ns->mq_msgsize_default; + else + tbl[i].data = NULL; + } + + ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", 
tbl); + } + if (!ns->mq_sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->mq_set); + return false; + } + + return true; +} + +void retire_mq_sysctls(struct ipc_namespace *ns) +{ + struct ctl_table *tbl; + + tbl = ns->mq_sysctls->ctl_table_arg; + unregister_sysctl_table(ns->mq_sysctls); + retire_sysctl_set(&ns->mq_set); + kfree(tbl); } diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 089c34d0732cf..79b0079ee1acb 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -164,8 +164,6 @@ static void remove_notification(struct mqueue_inode_info *info); static struct kmem_cache *mqueue_inode_cachep; -static struct ctl_table_header *mq_sysctl_table; - static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) { return container_of(inode, struct mqueue_inode_info, vfs_inode); @@ -1727,8 +1725,10 @@ static int __init init_mqueue_fs(void) if (mqueue_inode_cachep == NULL) return -ENOMEM; - /* ignore failures - they are not fatal */ - mq_sysctl_table = mq_register_sysctl_table(); + if (!setup_mq_sysctls(&init_ipc_ns)) { + pr_warn("sysctl registration failed\n"); + return -ENOMEM; + } error = register_filesystem(&mqueue_fs_type); if (error) @@ -1745,8 +1745,6 @@ static int __init init_mqueue_fs(void) out_filesystem: unregister_filesystem(&mqueue_fs_type); out_sysctl: - if (mq_sysctl_table) - unregister_sysctl_table(mq_sysctl_table); kmem_cache_destroy(mqueue_inode_cachep); return error; } diff --git a/ipc/namespace.c b/ipc/namespace.c index 8970e5959e725..7bda9471d450b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -59,6 +59,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (err) goto fail_put; + err = -ENOMEM; + if (!setup_mq_sysctls(ns)) + goto fail_put; + sem_init_ns(ns); msg_init_ns(ns); shm_init_ns(ns); @@ -125,6 +129,8 @@ static void free_ipc_ns(struct ipc_namespace *ns) msg_exit_ns(ns); shm_exit_ns(ns); + retire_mq_sysctls(ns); + dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); From patchwork Thu 
Mar 21 12:14:52 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Thibault Ferrante X-Patchwork-Id: 1914396 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=lists.ubuntu.com (client-ip=185.125.189.65; helo=lists.ubuntu.com; envelope-from=kernel-team-bounces@lists.ubuntu.com; receiver=patchwork.ozlabs.org) Received: from lists.ubuntu.com (lists.ubuntu.com [185.125.189.65]) (using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4V0kxH66rYz1yWs for ; Thu, 21 Mar 2024 23:17:39 +1100 (AEDT) Received: from localhost ([127.0.0.1] helo=lists.ubuntu.com) by lists.ubuntu.com with esmtp (Exim 4.86_2) (envelope-from ) id 1rnHMN-00006U-3e; Thu, 21 Mar 2024 12:17:27 +0000 Received: from smtp-relay-canonical-1.internal ([10.131.114.174] helo=smtp-relay-canonical-1.canonical.com) by lists.ubuntu.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.86_2) (envelope-from ) id 1rnHKD-0007tK-SM for kernel-team@lists.ubuntu.com; Thu, 21 Mar 2024 12:15:14 +0000 Received: from Q58-sff.buildd (2.general.thibf.uk.vpn [10.172.200.120]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by smtp-relay-canonical-1.canonical.com (Postfix) with ESMTPSA id 71948445A7; Thu, 21 Mar 2024 12:15:13 +0000 (UTC) From: Thibault Ferrante To: kernel-team@lists.ubuntu.com Subject: [SRU][J][PATCH 4/9] ipc: Store ipc sysctls in the ipc namespace Date: Thu, 21 Mar 2024 13:14:52 +0100 Message-ID: <20240321121457.362921-5-thibault.ferrante@canonical.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240321121457.362921-1-thibault.ferrante@canonical.com> 
References: <20240321121457.362921-1-thibault.ferrante@canonical.com> MIME-Version: 1.0 X-BeenThere: kernel-team@lists.ubuntu.com X-Mailman-Version: 2.1.20 Precedence: list List-Id: Kernel team discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" From: Alexey Gladkov BugLink: https://bugs.launchpad.net/bugs/2058485 The ipc sysctls are not available for modification inside the user namespace. Following the mqueue sysctls, we changed the implementation to be more userns friendly. So far, the changes do not provide additional access to files. This will be done in a future patch. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/be6f9d014276f4dddd0c3aa05a86052856c1c555.1644862280.git.legion@kernel.org Signed-off-by: Eric W. Biederman (cherry picked from commit 1f5c135ee509e89e0cc274333a65f73c62cb16e5) Signed-off-by: Thibault Ferrante --- include/linux/ipc_namespace.h | 21 ++++ ipc/ipc_sysctl.c | 189 ++++++++++++++++++++++------------ ipc/namespace.c | 4 + 3 files changed, 147 insertions(+), 67 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 9cb2ae3415e05..2c4ef98ae436c 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -67,6 +67,9 @@ struct ipc_namespace { struct ctl_table_set mq_set; struct ctl_table_header *mq_sysctls; + struct ctl_table_set ipc_set; + struct ctl_table_header *ipc_sysctls; + /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; struct ucounts *ucounts; @@ -191,4 +194,22 @@ static inline bool setup_mq_sysctls(struct ipc_namespace *ns) } #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */ + +#ifdef CONFIG_SYSVIPC_SYSCTL + +bool setup_ipc_sysctls(struct ipc_namespace *ns); +void retire_ipc_sysctls(struct ipc_namespace *ns); + +#else /* CONFIG_SYSVIPC_SYSCTL */ + +static inline void retire_ipc_sysctls(struct ipc_namespace *ns) +{ +} + +static inline bool 
setup_ipc_sysctls(struct ipc_namespace *ns) +{ + return true; +} + +#endif /* CONFIG_SYSVIPC_SYSCTL */ #endif diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index f101c171753f6..15210ac47e9e1 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -13,43 +13,22 @@ #include #include #include +#include #include "util.h" -static void *get_ipc(struct ctl_table *table) -{ - char *which = table->data; - struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; - which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; - return which; -} - -static int proc_ipc_dointvec(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table ipc_table; - - memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); -} - -static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, +static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { + struct ipc_namespace *ns = table->extra1; struct ctl_table ipc_table; + int err; memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); -} + ipc_table.extra1 = SYSCTL_ZERO; + ipc_table.extra2 = SYSCTL_ONE; -static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ipc_namespace *ns = current->nsproxy->ipc_ns; - int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); + err = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); if (err < 0) return err; @@ -58,17 +37,6 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, return err; } -static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table ipc_table; - memcpy(&ipc_table, table, 
sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - return proc_doulongvec_minmax(&ipc_table, write, buffer, - lenp, ppos); -} - static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -87,11 +55,17 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { + struct ipc_namespace *ns = table->extra1; + struct ctl_table ipc_table; int ret, semmni; - struct ipc_namespace *ns = current->nsproxy->ipc_ns; + + memcpy(&ipc_table, table, sizeof(ipc_table)); + + ipc_table.extra1 = NULL; + ipc_table.extra2 = NULL; semmni = ns->sem_ctls[3]; - ret = proc_ipc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret) ret = sem_check_semmni(current->nsproxy->ipc_ns); @@ -108,12 +82,18 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct user_namespace *user_ns = current->nsproxy->ipc_ns->user_ns; + struct ipc_namespace *ns = table->extra1; + struct ctl_table ipc_table; - if (write && !checkpoint_restore_ns_capable(user_ns)) + if (write && !checkpoint_restore_ns_capable(ns->user_ns)) return -EPERM; - return proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); + memcpy(&ipc_table, table, sizeof(ipc_table)); + + ipc_table.extra1 = SYSCTL_ZERO; + ipc_table.extra2 = SYSCTL_INT_MAX; + + return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); } #endif @@ -121,27 +101,27 @@ int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; -static struct ctl_table ipc_kern_table[] = { +static struct ctl_table ipc_sysctls[] = { { .procname = "shmmax", .data = &init_ipc_ns.shm_ctlmax, .maxlen = sizeof(init_ipc_ns.shm_ctlmax), .mode = 0644, - 
.proc_handler = proc_ipc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmall", .data = &init_ipc_ns.shm_ctlall, .maxlen = sizeof(init_ipc_ns.shm_ctlall), .mode = 0644, - .proc_handler = proc_ipc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmmni", .data = &init_ipc_ns.shm_ctlmni, .maxlen = sizeof(init_ipc_ns.shm_ctlmni), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, @@ -151,15 +131,13 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax_orphans, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, }, { .procname = "msgmax", .data = &init_ipc_ns.msg_ctlmax, .maxlen = sizeof(init_ipc_ns.msg_ctlmax), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, @@ -168,7 +146,7 @@ static struct ctl_table ipc_kern_table[] = { .data = &init_ipc_ns.msg_ctlmni, .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, @@ -186,7 +164,7 @@ static struct ctl_table ipc_kern_table[] = { .data = &init_ipc_ns.msg_ctlmnb, .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, @@ -204,8 +182,6 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), .mode = 0666, .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_INT_MAX, }, { .procname = "msg_next_id", @@ -213,8 +189,6 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), .mode = 
0666, .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_INT_MAX, }, { .procname = "shm_next_id", @@ -222,25 +196,106 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), .mode = 0666, .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_INT_MAX, }, #endif {} }; -static struct ctl_table ipc_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = ipc_kern_table, - }, - {} +static struct ctl_table_set *set_lookup(struct ctl_table_root *root) +{ + return &current->nsproxy->ipc_ns->ipc_set; +} + +static int set_is_seen(struct ctl_table_set *set) +{ + return &current->nsproxy->ipc_ns->ipc_set == set; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, }; +bool setup_ipc_sysctls(struct ipc_namespace *ns) +{ + struct ctl_table *tbl; + + setup_sysctl_set(&ns->ipc_set, &set_root, set_is_seen); + + tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL); + if (tbl) { + int i; + + for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) { + if (tbl[i].data == &init_ipc_ns.shm_ctlmax) { + tbl[i].data = &ns->shm_ctlmax; + + } else if (tbl[i].data == &init_ipc_ns.shm_ctlall) { + tbl[i].data = &ns->shm_ctlall; + + } else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) { + tbl[i].data = &ns->shm_ctlmni; + + } else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) { + tbl[i].data = &ns->shm_rmid_forced; + tbl[i].extra1 = ns; + + } else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) { + tbl[i].data = &ns->msg_ctlmax; + + } else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) { + tbl[i].data = &ns->msg_ctlmni; + + } else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) { + tbl[i].data = &ns->msg_ctlmnb; + + } else if (tbl[i].data == &init_ipc_ns.sem_ctls) { + tbl[i].data = &ns->sem_ctls; + tbl[i].extra1 = ns; +#ifdef CONFIG_CHECKPOINT_RESTORE + } else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) { + tbl[i].data = 
&ns->ids[IPC_SEM_IDS].next_id; + tbl[i].extra1 = ns; + + } else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) { + tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id; + tbl[i].extra1 = ns; + + } else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) { + tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id; + tbl[i].extra1 = ns; +#endif + } else { + tbl[i].data = NULL; + } + } + + ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl); + } + if (!ns->ipc_sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->ipc_set); + return false; + } + + return true; +} + +void retire_ipc_sysctls(struct ipc_namespace *ns) +{ + struct ctl_table *tbl; + + tbl = ns->ipc_sysctls->ctl_table_arg; + unregister_sysctl_table(ns->ipc_sysctls); + retire_sysctl_set(&ns->ipc_set); + kfree(tbl); +} + static int __init ipc_sysctl_init(void) { - register_sysctl_table(ipc_root_table); + if (!setup_ipc_sysctls(&init_ipc_ns)) { + pr_warn("ipc sysctl registration failed\n"); + return -ENOMEM; + } return 0; } diff --git a/ipc/namespace.c b/ipc/namespace.c index 7bda9471d450b..dae5f8affd7ca 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -63,6 +63,9 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (!setup_mq_sysctls(ns)) goto fail_put; + if (!setup_ipc_sysctls(ns)) + goto fail_put; + sem_init_ns(ns); msg_init_ns(ns); shm_init_ns(ns); @@ -130,6 +133,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) shm_exit_ns(ns); retire_mq_sysctls(ns); + retire_ipc_sysctls(ns); dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); From patchwork Thu Mar 21 12:14:53 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Thibault Ferrante X-Patchwork-Id: 1914392 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=lists.ubuntu.com 
(client-ip=185.125.189.65; helo=lists.ubuntu.com; envelope-from=kernel-team-bounces@lists.ubuntu.com; receiver=patchwork.ozlabs.org) Received: from lists.ubuntu.com (lists.ubuntu.com [185.125.189.65]) (using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4V0kwH2Z2Xz1yWs for ; Thu, 21 Mar 2024 23:16:47 +1100 (AEDT) Received: from localhost ([127.0.0.1] helo=lists.ubuntu.com) by lists.ubuntu.com with esmtp (Exim 4.86_2) (envelope-from ) id 1rnHL9-00080k-8y; Thu, 21 Mar 2024 12:16:16 +0000 Received: from smtp-relay-canonical-1.internal ([10.131.114.174] helo=smtp-relay-canonical-1.canonical.com) by lists.ubuntu.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.86_2) (envelope-from ) id 1rnHKE-0007tO-JA for kernel-team@lists.ubuntu.com; Thu, 21 Mar 2024 12:15:16 +0000 Received: from Q58-sff.buildd (2.general.thibf.uk.vpn [10.172.200.120]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by smtp-relay-canonical-1.canonical.com (Postfix) with ESMTPSA id 2464C445A8; Thu, 21 Mar 2024 12:15:14 +0000 (UTC) From: Thibault Ferrante To: kernel-team@lists.ubuntu.com Subject: [SRU][J][PATCH 5/9] ipc: Use the same namespace to modify and validate Date: Thu, 21 Mar 2024 13:14:53 +0100 Message-ID: <20240321121457.362921-6-thibault.ferrante@canonical.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240321121457.362921-1-thibault.ferrante@canonical.com> References: <20240321121457.362921-1-thibault.ferrante@canonical.com> MIME-Version: 1.0 X-BeenThere: kernel-team@lists.ubuntu.com X-Mailman-Version: 2.1.20 Precedence: list List-Id: Kernel team discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" From: Alexey Gladkov BugLink: 
https://bugs.launchpad.net/bugs/2058485 In the 1f5c135ee509 ("ipc: Store ipc sysctls in the ipc namespace") I missed that in addition to the modification of sem_ctls[3], the change is validated. This validation must occur in the same namespace. Link: https://lore.kernel.org/lkml/875ymnvryb.fsf@email.froward.int.ebiederm.org/ Fixes: 1f5c135ee509 ("ipc: Store ipc sysctls in the ipc namespace") Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/b3cb9a25cce6becbef77186bc1216071a08a969b.1651584847.git.legion@kernel.org Signed-off-by: Eric W. Biederman (cherry picked from commit def7343ff03bbb36ce7a34dcb19cab599f0da446) Signed-off-by: Thibault Ferrante --- ipc/ipc_sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 15210ac47e9e1..d1d5204cf5893 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -68,7 +68,7 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret) - ret = sem_check_semmni(current->nsproxy->ipc_ns); + ret = sem_check_semmni(ns); /* * Reset the semmni value if an error happens. 
From patchwork Thu Mar 21 12:14:54 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Thibault Ferrante X-Patchwork-Id: 1914397 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=lists.ubuntu.com (client-ip=185.125.189.65; helo=lists.ubuntu.com; envelope-from=kernel-team-bounces@lists.ubuntu.com; receiver=patchwork.ozlabs.org) Received: from lists.ubuntu.com (lists.ubuntu.com [185.125.189.65]) (using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4V0kxY6lyzz1yWs for ; Thu, 21 Mar 2024 23:17:53 +1100 (AEDT) Received: from localhost ([127.0.0.1] helo=lists.ubuntu.com) by lists.ubuntu.com with esmtp (Exim 4.86_2) (envelope-from ) id 1rnHMY-0000Lg-L5; Thu, 21 Mar 2024 12:17:38 +0000 Received: from smtp-relay-canonical-1.internal ([10.131.114.174] helo=smtp-relay-canonical-1.canonical.com) by lists.ubuntu.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.86_2) (envelope-from ) id 1rnHKF-0007tR-9P for kernel-team@lists.ubuntu.com; Thu, 21 Mar 2024 12:15:16 +0000 Received: from Q58-sff.buildd (2.general.thibf.uk.vpn [10.172.200.120]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by smtp-relay-canonical-1.canonical.com (Postfix) with ESMTPSA id EE902445A9; Thu, 21 Mar 2024 12:15:14 +0000 (UTC) From: Thibault Ferrante To: kernel-team@lists.ubuntu.com Subject: [SRU][J][PATCH 6/9] ipc: Remove extra1 field abuse to pass ipc namespace Date: Thu, 21 Mar 2024 13:14:54 +0100 Message-ID: <20240321121457.362921-7-thibault.ferrante@canonical.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: 
<20240321121457.362921-1-thibault.ferrante@canonical.com> References: <20240321121457.362921-1-thibault.ferrante@canonical.com> MIME-Version: 1.0 X-BeenThere: kernel-team@lists.ubuntu.com X-Mailman-Version: 2.1.20 Precedence: list List-Id: Kernel team discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" From: Alexey Gladkov BugLink: https://bugs.launchpad.net/bugs/2058485 Eric Biederman pointed out that using .extra1 to pass ipc namespace looks like an ugly hack and there is a better solution. We can get the ipc_namespace using the .data field. Link: https://lore.kernel.org/lkml/87czib9g38.fsf@email.froward.int.ebiederm.org/ Fixes: 1f5c135ee509 ("ipc: Store ipc sysctls in the ipc namespace") Signed-off-by: Eric W. Biederman Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/93df64a8fe93ba20ebbe1d9f8eda484b2f325426.1651584847.git.legion@kernel.org Signed-off-by: Eric W. Biederman (cherry picked from commit dd141a4955d5ebbb3f4c7996796e86a3ac9ed57f) Signed-off-by: Thibault Ferrante --- ipc/ipc_sysctl.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index d1d5204cf5893..eb7ba8e0a355c 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -19,16 +19,11 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct ipc_namespace *ns = table->extra1; - struct ctl_table ipc_table; + struct ipc_namespace *ns = + container_of(table->data, struct ipc_namespace, shm_rmid_forced); int err; - memcpy(&ipc_table, table, sizeof(ipc_table)); - - ipc_table.extra1 = SYSCTL_ZERO; - ipc_table.extra2 = SYSCTL_ONE; - - err = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err < 0) return err; @@ -55,15 +50,10 @@ static int 
proc_ipc_auto_msgmni(struct ctl_table *table, int write, static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct ipc_namespace *ns = table->extra1; - struct ctl_table ipc_table; + struct ipc_namespace *ns = + container_of(table->data, struct ipc_namespace, sem_ctls); int ret, semmni; - memcpy(&ipc_table, table, sizeof(ipc_table)); - - ipc_table.extra1 = NULL; - ipc_table.extra2 = NULL; - semmni = ns->sem_ctls[3]; ret = proc_dointvec(table, write, buffer, lenp, ppos); @@ -131,6 +121,8 @@ static struct ctl_table ipc_sysctls[] = { .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax_orphans, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "msgmax", @@ -237,7 +229,6 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns) } else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) { tbl[i].data = &ns->shm_rmid_forced; - tbl[i].extra1 = ns; } else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) { tbl[i].data = &ns->msg_ctlmax; @@ -250,7 +241,6 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns) } else if (tbl[i].data == &init_ipc_ns.sem_ctls) { tbl[i].data = &ns->sem_ctls; - tbl[i].extra1 = ns; #ifdef CONFIG_CHECKPOINT_RESTORE } else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) { tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id; From patchwork Thu Mar 21 12:14:55 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Thibault Ferrante X-Patchwork-Id: 1914398 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=lists.ubuntu.com (client-ip=185.125.189.65; helo=lists.ubuntu.com; envelope-from=kernel-team-bounces@lists.ubuntu.com; receiver=patchwork.ozlabs.org) Received: from lists.ubuntu.com (lists.ubuntu.com [185.125.189.65]) (using 
TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4V0kxd4LSwz1yWs for ; Thu, 21 Mar 2024 23:17:57 +1100 (AEDT) Received: from localhost ([127.0.0.1] helo=lists.ubuntu.com) by lists.ubuntu.com with esmtp (Exim 4.86_2) (envelope-from ) id 1rnHMa-0000Qd-1r; Thu, 21 Mar 2024 12:17:40 +0000 Received: from smtp-relay-canonical-1.internal ([10.131.114.174] helo=smtp-relay-canonical-1.canonical.com) by lists.ubuntu.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.86_2) (envelope-from ) id 1rnHKG-0007tZ-6o for kernel-team@lists.ubuntu.com; Thu, 21 Mar 2024 12:15:17 +0000 Received: from Q58-sff.buildd (2.general.thibf.uk.vpn [10.172.200.120]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by smtp-relay-canonical-1.canonical.com (Postfix) with ESMTPSA id DA795445AA; Thu, 21 Mar 2024 12:15:15 +0000 (UTC) From: Thibault Ferrante To: kernel-team@lists.ubuntu.com Subject: [SRU][J][PATCH 7/9] ipc: Check permissions for checkpoint_restart sysctls at open time Date: Thu, 21 Mar 2024 13:14:55 +0100 Message-ID: <20240321121457.362921-8-thibault.ferrante@canonical.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240321121457.362921-1-thibault.ferrante@canonical.com> References: <20240321121457.362921-1-thibault.ferrante@canonical.com> MIME-Version: 1.0 X-BeenThere: kernel-team@lists.ubuntu.com X-Mailman-Version: 2.1.20 Precedence: list List-Id: Kernel team discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" From: Alexey Gladkov BugLink: https://bugs.launchpad.net/bugs/2058485 As Eric Biederman pointed out, it is possible not to use a custom proc_handler and check permissions for every write, but to use a .permission handler. 
That will allow the checkpoint_restart sysctls to perform all of their permission checks at open time, and not need any other special code. Link: https://lore.kernel.org/lkml/87czib9g38.fsf@email.froward.int.ebiederm.org/ Fixes: 1f5c135ee509 ("ipc: Store ipc sysctls in the ipc namespace") Signed-off-by: Eric W. Biederman Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/65fa8459803830608da4610a39f33c76aa933eb9.1651584847.git.legion@kernel.org Signed-off-by: Eric W. Biederman (cherry picked from commit 0889f44e281034e180daa6daf3e2d57c012452d4) Signed-off-by: Thibault Ferrante --- ipc/ipc_sysctl.c | 57 ++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index eb7ba8e0a355c..5a58598d48c8c 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -68,25 +68,6 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_CHECKPOINT_RESTORE -static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table, - int write, void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ipc_namespace *ns = table->extra1; - struct ctl_table ipc_table; - - if (write && !checkpoint_restore_ns_capable(ns->user_ns)) - return -EPERM; - - memcpy(&ipc_table, table, sizeof(ipc_table)); - - ipc_table.extra1 = SYSCTL_ZERO; - ipc_table.extra2 = SYSCTL_INT_MAX; - - return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); -} -#endif - int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; @@ -172,22 +153,28 @@ static struct ctl_table ipc_sysctls[] = { .procname = "sem_next_id", .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), - .mode = 0666, - .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore, + .mode = 0444, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = 
"msg_next_id", .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), - .mode = 0666, - .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore, + .mode = 0444, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "shm_next_id", .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), - .mode = 0666, - .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore, + .mode = 0444, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, #endif {} @@ -203,8 +190,25 @@ static int set_is_seen(struct ctl_table_set *set) return ¤t->nsproxy->ipc_ns->ipc_set == set; } +static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table) +{ + int mode = table->mode; + +#ifdef CONFIG_CHECKPOINT_RESTORE + struct ipc_namespace *ns = current->nsproxy->ipc_ns; + + if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) || + (table->data == &ns->ids[IPC_MSG_IDS].next_id) || + (table->data == &ns->ids[IPC_SHM_IDS].next_id)) && + checkpoint_restore_ns_capable(ns->user_ns)) + mode = 0666; +#endif + return mode; +} + static struct ctl_table_root set_root = { .lookup = set_lookup, + .permissions = ipc_permissions, }; bool setup_ipc_sysctls(struct ipc_namespace *ns) @@ -244,15 +248,12 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns) #ifdef CONFIG_CHECKPOINT_RESTORE } else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) { tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id; - tbl[i].extra1 = ns; } else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) { tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id; - tbl[i].extra1 = ns; } else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) { tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id; - tbl[i].extra1 = ns; #endif } else { tbl[i].data = NULL; From patchwork Thu Mar 21 12:14:56 2024 Content-Type: text/plain; charset="utf-8" 
MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Thibault Ferrante X-Patchwork-Id: 1914399 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=lists.ubuntu.com (client-ip=185.125.189.65; helo=lists.ubuntu.com; envelope-from=kernel-team-bounces@lists.ubuntu.com; receiver=patchwork.ozlabs.org) Received: from lists.ubuntu.com (lists.ubuntu.com [185.125.189.65]) (using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4V0kxr0198z1yWs for ; Thu, 21 Mar 2024 23:18:07 +1100 (AEDT) Received: from localhost ([127.0.0.1] helo=lists.ubuntu.com) by lists.ubuntu.com with esmtp (Exim 4.86_2) (envelope-from ) id 1rnHMq-0000ai-8Y; Thu, 21 Mar 2024 12:17:56 +0000 Received: from smtp-relay-canonical-1.internal ([10.131.114.174] helo=smtp-relay-canonical-1.canonical.com) by lists.ubuntu.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.86_2) (envelope-from ) id 1rnHKG-0007to-UL for kernel-team@lists.ubuntu.com; Thu, 21 Mar 2024 12:15:18 +0000 Received: from Q58-sff.buildd (2.general.thibf.uk.vpn [10.172.200.120]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by smtp-relay-canonical-1.canonical.com (Postfix) with ESMTPSA id 97809445A4; Thu, 21 Mar 2024 12:15:16 +0000 (UTC) From: Thibault Ferrante To: kernel-team@lists.ubuntu.com Subject: [SRU][J][PATCH 8/9] percpu: add percpu_counter_add_local and percpu_counter_sub_local Date: Thu, 21 Mar 2024 13:14:56 +0100 Message-ID: <20240321121457.362921-9-thibault.ferrante@canonical.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240321121457.362921-1-thibault.ferrante@canonical.com> References: 
<20240321121457.362921-1-thibault.ferrante@canonical.com> MIME-Version: 1.0 X-BeenThere: kernel-team@lists.ubuntu.com X-Mailman-Version: 2.1.20 Precedence: list List-Id: Kernel team discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" From: Jiebin Sun BugLink: https://bugs.launchpad.net/bugs/2058485 Patch series "/msg: mitigate the lock contention in ipc/msg", v6. Here are two patches to mitigate the lock contention in ipc/msg. The 1st patch is to add the new interface percpu_counter_add_local and percpu_counter_sub_local. The batch size in percpu_counter_add_batch should be very large in heavy writing and rare reading case. Add the "_local" version, and mostly it will do local adding, reduce the global updating and mitigate lock contention in writing. The 2nd patch is to use percpu_counter instead of atomic update in ipc/msg. The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Change them to percpu_counter greatly improve the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. Reading of the count done in msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent. This patch (of 2): The batch size in percpu_counter_add_batch should be very large in heavy writing and rare reading case. Add the "_local" version, and mostly it will do local adding, reduce the global updating and mitigate lock contention in writing. Link: https://lkml.kernel.org/r/20220913192538.3023708-1-jiebin.sun@intel.com Link: https://lkml.kernel.org/r/20220913192538.3023708-2-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Dennis Zhou Cc: "Eric W . 
Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Davidlohr Bueso Signed-off-by: Andrew Morton (cherry picked from commit 5d0ce3595ab75330a15cec914096efbbb8b41e4a) Signed-off-by: Thibault Ferrante --- include/linux/percpu_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 01861eebed79d..8ed5fba6d156f 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -15,6 +15,9 @@ #include #include +/* percpu_counter batch for local add or sub */ +#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX + #ifdef CONFIG_SMP struct percpu_counter { @@ -56,6 +59,22 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } +/* + * With percpu_counter_add_local() and percpu_counter_sub_local(), counts + * are accumulated in local per cpu counter and not in fbc->count until + * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter + * write efficient. + * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be + * used to add up the counts from each CPU to account for all the local + * counts. So percpu_counter_add_local() and percpu_counter_sub_local() + * should be used when a counter is updated frequently and read rarely. 
+ */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); +} + static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret = __percpu_counter_sum(fbc); @@ -138,6 +157,13 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount) preempt_enable(); } +/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add(fbc, amount); +} + static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { @@ -193,4 +219,10 @@ static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount) percpu_counter_add(fbc, -amount); } +static inline void +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_local(fbc, -amount); +} + #endif /* _LINUX_PERCPU_COUNTER_H */ From patchwork Thu Mar 21 12:14:57 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Thibault Ferrante X-Patchwork-Id: 1914393 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@legolas.ozlabs.org Authentication-Results: legolas.ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=lists.ubuntu.com (client-ip=185.125.189.65; helo=lists.ubuntu.com; envelope-from=kernel-team-bounces@lists.ubuntu.com; receiver=patchwork.ozlabs.org) Received: from lists.ubuntu.com (lists.ubuntu.com [185.125.189.65]) (using TLSv1.2 with cipher ECDHE-ECDSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by legolas.ozlabs.org (Postfix) with ESMTPS id 4V0kwv62cwz1yWs for ; Thu, 21 Mar 2024 23:17:19 +1100 (AEDT) Received: from localhost ([127.0.0.1] helo=lists.ubuntu.com) by lists.ubuntu.com with esmtp (Exim 4.86_2) (envelope-from ) id 1rnHLu-0008Et-3d; Thu, 21 Mar 2024 12:17:00 +0000 
Received: from smtp-relay-canonical-1.internal ([10.131.114.174] helo=smtp-relay-canonical-1.canonical.com) by lists.ubuntu.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.86_2) (envelope-from ) id 1rnHKH-0007tq-KZ for kernel-team@lists.ubuntu.com; Thu, 21 Mar 2024 12:15:18 +0000 Received: from Q58-sff.buildd (2.general.thibf.uk.vpn [10.172.200.120]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by smtp-relay-canonical-1.canonical.com (Postfix) with ESMTPSA id 606ED445A6; Thu, 21 Mar 2024 12:15:17 +0000 (UTC) From: Thibault Ferrante To: kernel-team@lists.ubuntu.com Subject: [SRU][J][PATCH 9/9] ipc/msg: mitigate the lock contention with percpu counter Date: Thu, 21 Mar 2024 13:14:57 +0100 Message-ID: <20240321121457.362921-10-thibault.ferrante@canonical.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240321121457.362921-1-thibault.ferrante@canonical.com> References: <20240321121457.362921-1-thibault.ferrante@canonical.com> MIME-Version: 1.0 X-BeenThere: kernel-team@lists.ubuntu.com X-Mailman-Version: 2.1.20 Precedence: list List-Id: Kernel team discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" From: Jiebin Sun BugLink: https://bugs.launchpad.net/bugs/2058485 The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Change them to percpu_counter greatly improve the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. Reading of the count done in msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent. Apply the patch and test the pts/stress-ng-1.4.0 -- system v message passing (160 threads). 
Score gain: 3.99x CPU: ICX 8380 x 2 sockets Core number: 40 x 2 physical cores Benchmark: pts/stress-ng-1.4.0 -- system v message passing (160 threads) [akpm@linux-foundation.org: coding-style cleanups] [jiebin.sun@intel.com: avoid negative value by overflow in msginfo] Link: https://lkml.kernel.org/r/20220920150809.4014944-1-jiebin.sun@intel.com [akpm@linux-foundation.org: fix min() warnings] Link: https://lkml.kernel.org/r/20220913192538.3023708-3-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Signed-off-by: Andrew Morton (cherry picked from commit 72d1e611082eda18689106a0c192f2827072713c) Signed-off-by: Thibault Ferrante --- include/linux/ipc_namespace.h | 5 ++-- ipc/msg.c | 48 +++++++++++++++++++++++++---------- ipc/namespace.c | 5 +++- ipc/util.h | 4 +-- 4 files changed, 43 insertions(+), 19 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 2c4ef98ae436c..e02b71e3fd6aa 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -11,6 +11,7 @@ #include #include #include +#include struct user_namespace; @@ -36,8 +37,8 @@ struct ipc_namespace { unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; - atomic_t msg_bytes; - atomic_t msg_hdrs; + struct percpu_counter percpu_msg_bytes; + struct percpu_counter percpu_msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; diff --git a/ipc/msg.c b/ipc/msg.c index a0d05775af2c5..e4e0990e08f75 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { - atomic_dec(&ns->msg_hdrs); + 
percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } - atomic_sub(msq->q_cbytes, &ns->msg_bytes); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); @@ -495,17 +496,22 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, msginfo->msgssz = MSGSSZ; msginfo->msgseg = MSGSEG; down_read(&msg_ids(ns).rwsem); - if (cmd == MSG_INFO) { + if (cmd == MSG_INFO) msginfo->msgpool = msg_ids(ns).in_use; - msginfo->msgmap = atomic_read(&ns->msg_hdrs); - msginfo->msgtql = atomic_read(&ns->msg_bytes); + max_idx = ipc_get_maxidx(&msg_ids(ns)); + up_read(&msg_ids(ns).rwsem); + if (cmd == MSG_INFO) { + msginfo->msgmap = min_t(int, + percpu_counter_sum(&ns->percpu_msg_hdrs), + INT_MAX); + msginfo->msgtql = min_t(int, + percpu_counter_sum(&ns->percpu_msg_bytes), + INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } - max_idx = ipc_get_maxidx(&msg_ids(ns)); - up_read(&msg_ids(ns).rwsem); return (max_idx < 0) ? 
0 : max_idx; } @@ -935,8 +941,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext, list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; - atomic_add(msgsz, &ns->msg_bytes); - atomic_inc(&ns->msg_hdrs); + percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); + percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } err = 0; @@ -1159,8 +1165,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in msq->q_rtime = ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -= msg->m_ts; - atomic_sub(msg->m_ts, &ns->msg_bytes); - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); goto out_unlock0; @@ -1297,20 +1303,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, } #endif -void msg_init_ns(struct ipc_namespace *ns) +int msg_init_ns(struct ipc_namespace *ns) { + int ret; + ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; - atomic_set(&ns->msg_bytes, 0); - atomic_set(&ns->msg_hdrs, 0); + ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); + if (ret) + goto fail_msg_bytes; + ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); + if (ret) + goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + return 0; + +fail_msg_hdrs: + percpu_counter_destroy(&ns->percpu_msg_bytes); +fail_msg_bytes: + return ret; } #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); diff --git a/ipc/namespace.c b/ipc/namespace.c index dae5f8affd7ca..556178d2a4ecf 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -66,8 +66,11 @@ static struct ipc_namespace 
*create_ipc_ns(struct user_namespace *user_ns, if (!setup_ipc_sysctls(ns)) goto fail_put; + err = msg_init_ns(ns); + if (err) + goto fail_put; + sem_init_ns(ns); - msg_init_ns(ns); shm_init_ns(ns); return ns; diff --git a/ipc/util.h b/ipc/util.h index 2dd7ce0416d8e..b2906e3665394 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { } #ifdef CONFIG_SYSVIPC void sem_init_ns(struct ipc_namespace *ns); -void msg_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); void shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); @@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline void msg_init_ns(struct ipc_namespace *ns) { } +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; } static inline void shm_init_ns(struct ipc_namespace *ns) { } static inline void sem_exit_ns(struct ipc_namespace *ns) { }