diff mbox

[next-20170609] Oops while running CPU off-on (cpuset.c/cpuset_can_attach)

Message ID 20170705152855.GD19330@htj.duckdns.org (mailing list archive)
State Not Applicable
Headers show

Commit Message

Tejun Heo July 5, 2017, 3:28 p.m. UTC
Hello, Abdul.

Thanks for the debug info.  Can you please see whether the following
patch fixes the issue?  If the problem is too difficult to reproduce
to confirm the fix by seeing whether it no longer triggers, please let
me know.  We can instead apply a patch which triggers WARN on the
failing condition to confirm the diagnosis.

Thanks.

Comments

Abdul Haleem July 7, 2017, 6:39 a.m. UTC | #1
On Wed, 2017-07-05 at 11:28 -0400, Tejun Heo wrote:
> Hello, Abdul.
> 
> Thanks for the debug info.  Can you please see whether the following
> patch fixes the issue?  

It is my pleasure and yes the patch fixes the problem.

> If the problem is too difficult to reproduce

The problem was reproducible all the time. 

With the patch fix, I tried multiple times and long runs of cpu off-on
cycles but no Oops is seen.

Thank you for spending your valuable time on fixing this issue.

Reported-and-tested-by : Abdul Haleem <abdhalee@linux.vnet.ibm.com>

> to confirm the fix by seeing whether it no longer triggers, please let
> me know.  We can instead apply a patch which triggers WARN on the
> failing condition to confirm the diagnosis.
> 
> Thanks.
> 
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index 793565c05742..8b4c3c2f2509 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -33,6 +33,9 @@ struct cgroup_taskset {
>  	struct list_head	src_csets;
>  	struct list_head	dst_csets;
> 
> +	/* the number of tasks in the set */
> +	int			nr_tasks;
> +
>  	/* the subsys currently being processed */
>  	int			ssid;
> 
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index dbfd7028b1c6..e3c4152741a3 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1954,6 +1954,8 @@ static void cgroup_migrate_add_task(struct task_struct *task,
>  	if (!cset->mg_src_cgrp)
>  		return;
> 
> +	mgctx->tset.nr_tasks++;
> +
>  	list_move_tail(&task->cg_list, &cset->mg_tasks);
>  	if (list_empty(&cset->mg_node))
>  		list_add_tail(&cset->mg_node,
> @@ -2047,16 +2049,18 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
>  		return 0;
> 
>  	/* check that we can legitimately attach to the cgroup */
> -	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
> -		if (ss->can_attach) {
> -			tset->ssid = ssid;
> -			ret = ss->can_attach(tset);
> -			if (ret) {
> -				failed_ssid = ssid;
> -				goto out_cancel_attach;
> +	if (tset->nr_tasks) {
> +		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
> +			if (ss->can_attach) {
> +				tset->ssid = ssid;
> +				ret = ss->can_attach(tset);
> +				if (ret) {
> +					failed_ssid = ssid;
> +					goto out_cancel_attach;
> +				}
>  			}
> -		}
> -	} while_each_subsys_mask();
> +		} while_each_subsys_mask();
> +	}
> 
>  	/*
>  	 * Now that we're guaranteed success, proceed to move all tasks to
> @@ -2085,25 +2089,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
>  	 */
>  	tset->csets = &tset->dst_csets;
> 
> -	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
> -		if (ss->attach) {
> -			tset->ssid = ssid;
> -			ss->attach(tset);
> -		}
> -	} while_each_subsys_mask();
> +	if (tset->nr_tasks) {
> +		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
> +			if (ss->attach) {
> +				tset->ssid = ssid;
> +				ss->attach(tset);
> +			}
> +		} while_each_subsys_mask();
> +	}
> 
>  	ret = 0;
>  	goto out_release_tset;
> 
>  out_cancel_attach:
> -	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
> -		if (ssid == failed_ssid)
> -			break;
> -		if (ss->cancel_attach) {
> -			tset->ssid = ssid;
> -			ss->cancel_attach(tset);
> -		}
> -	} while_each_subsys_mask();
> +	if (tset->nr_tasks) {
> +		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
> +			if (ssid == failed_ssid)
> +				break;
> +			if (ss->cancel_attach) {
> +				tset->ssid = ssid;
> +				ss->cancel_attach(tset);
> +			}
> +		} while_each_subsys_mask();
> +	}
>  out_release_tset:
>  	spin_lock_irq(&css_set_lock);
>  	list_splice_init(&tset->dst_csets, &tset->src_csets);
>
diff mbox

Patch

diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 793565c05742..8b4c3c2f2509 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -33,6 +33,9 @@  struct cgroup_taskset {
 	struct list_head	src_csets;
 	struct list_head	dst_csets;
 
+	/* the number of tasks in the set */
+	int			nr_tasks;
+
 	/* the subsys currently being processed */
 	int			ssid;
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index dbfd7028b1c6..e3c4152741a3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1954,6 +1954,8 @@  static void cgroup_migrate_add_task(struct task_struct *task,
 	if (!cset->mg_src_cgrp)
 		return;
 
+	mgctx->tset.nr_tasks++;
+
 	list_move_tail(&task->cg_list, &cset->mg_tasks);
 	if (list_empty(&cset->mg_node))
 		list_add_tail(&cset->mg_node,
@@ -2047,16 +2049,18 @@  static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 		return 0;
 
 	/* check that we can legitimately attach to the cgroup */
-	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
-		if (ss->can_attach) {
-			tset->ssid = ssid;
-			ret = ss->can_attach(tset);
-			if (ret) {
-				failed_ssid = ssid;
-				goto out_cancel_attach;
+	if (tset->nr_tasks) {
+		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+			if (ss->can_attach) {
+				tset->ssid = ssid;
+				ret = ss->can_attach(tset);
+				if (ret) {
+					failed_ssid = ssid;
+					goto out_cancel_attach;
+				}
 			}
-		}
-	} while_each_subsys_mask();
+		} while_each_subsys_mask();
+	}
 
 	/*
 	 * Now that we're guaranteed success, proceed to move all tasks to
@@ -2085,25 +2089,29 @@  static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 	 */
 	tset->csets = &tset->dst_csets;
 
-	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
-		if (ss->attach) {
-			tset->ssid = ssid;
-			ss->attach(tset);
-		}
-	} while_each_subsys_mask();
+	if (tset->nr_tasks) {
+		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+			if (ss->attach) {
+				tset->ssid = ssid;
+				ss->attach(tset);
+			}
+		} while_each_subsys_mask();
+	}
 
 	ret = 0;
 	goto out_release_tset;
 
 out_cancel_attach:
-	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
-		if (ssid == failed_ssid)
-			break;
-		if (ss->cancel_attach) {
-			tset->ssid = ssid;
-			ss->cancel_attach(tset);
-		}
-	} while_each_subsys_mask();
+	if (tset->nr_tasks) {
+		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+			if (ssid == failed_ssid)
+				break;
+			if (ss->cancel_attach) {
+				tset->ssid = ssid;
+				ss->cancel_attach(tset);
+			}
+		} while_each_subsys_mask();
+	}
 out_release_tset:
 	spin_lock_irq(&css_set_lock);
 	list_splice_init(&tset->dst_csets, &tset->src_csets);