sched: Fix balance vs hotplug race
Since (e761b77: cpu hotplug, sched: Introduce cpu_active_map and redo
sched domain managment) we have cpu_active_mask which is suppose to rule
scheduler migration and load-balancing, except it never (fully) did.
The particular problem being solved here is a crash in try_to_wake_up()
where select_task_rq() ends up selecting an offline cpu because
select_task_rq_fair() trusts the sched_domain tree to reflect the
current state of affairs, similarly select_task_rq_rt() trusts the
root_domain.
However, the sched_domains are updated from CPU_DEAD, which is after the
cpu is taken offline and after stop_machine is done. Therefore it can
race perfectly well with code assuming the domains are right.
Cure this by building the domains from cpu_active_mask on
CPU_DOWN_PREPARE.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 43fb7e8..ba401fa 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -872,7 +872,7 @@
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
return -EINVAL;
}
retval = validate_change(cs, trialcs);
@@ -2010,7 +2010,7 @@
}
/* Continue past cpusets with all cpus, mems online */
- if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
+ if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;
@@ -2019,7 +2019,7 @@
/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_online_mask);
+ cpu_active_mask);
nodes_and(cp->mems_allowed, cp->mems_allowed,
node_states[N_HIGH_MEMORY]);
mutex_unlock(&callback_mutex);
@@ -2057,8 +2057,10 @@
switch (phase) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
break;
default:
@@ -2067,7 +2069,7 @@
cgroup_lock();
mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
mutex_unlock(&callback_mutex);
scan_for_empty_cpusets(&top_cpuset);
ndoms = generate_sched_domains(&doms, &attr);
@@ -2114,7 +2116,7 @@
void __init cpuset_init_smp(void)
{
- cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
hotcpu_notifier(cpuset_track_online_cpus, 0);