mm: revert "oom: move oom_adj value"
The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to
the mm_struct. It was a very good first step for sanitize OOM.
However Paul Menage reported the commit makes regression to his job
scheduler. Current OOM logic can kill OOM_DISABLED process.
Why? His program has the code of similar to the following.
...
set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */
...
if (vfork() == 0) {
set_oom_adj(0); /* Invoked child can be killed */
execve("foo-bar-cmd");
}
....
vfork() parent and child are shared the same mm_struct. then above
set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also
change oom_adj for vfork() parent. Then, vfork() parent (job scheduler)
lost OOM immune and it was killed.
Actually, fork-setting-exec idiom is very frequently used in userland program.
We must not break this assumption.
Then, this patch revert commit 2ff05b2b and related commit.
Reverted commit list
---------------------
- commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct)
- commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE)
- commit 8123681022 (oom: only oom kill exiting tasks with attached memory)
- commit 933b787b57 (mm: copy over oom_adj value at fork time)
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 175a67a..a7b2460 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,7 +58,6 @@
unsigned long points, cpu_time, run_time;
struct mm_struct *mm;
struct task_struct *child;
- int oom_adj;
task_lock(p);
mm = p->mm;
@@ -66,11 +65,6 @@
task_unlock(p);
return 0;
}
- oom_adj = mm->oom_adj;
- if (oom_adj == OOM_DISABLE) {
- task_unlock(p);
- return 0;
- }
/*
* The memory size of the process is the basis for the badness.
@@ -154,15 +148,15 @@
points /= 8;
/*
- * Adjust the score by oom_adj.
+ * Adjust the score by oomkilladj.
*/
- if (oom_adj) {
- if (oom_adj > 0) {
+ if (p->oomkilladj) {
+ if (p->oomkilladj > 0) {
if (!points)
points = 1;
- points <<= oom_adj;
+ points <<= p->oomkilladj;
} else
- points >>= -(oom_adj);
+ points >>= -(p->oomkilladj);
}
#ifdef DEBUG
@@ -257,8 +251,11 @@
*ppoints = ULONG_MAX;
}
+ if (p->oomkilladj == OOM_DISABLE)
+ continue;
+
points = badness(p, uptime.tv_sec);
- if (points > *ppoints) {
+ if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
}
@@ -307,7 +304,8 @@
}
printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
- get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
+ get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+ p->comm);
task_unlock(p);
} while_each_thread(g, p);
}
@@ -325,8 +323,11 @@
return;
}
- if (!p->mm)
+ if (!p->mm) {
+ WARN_ON(1);
+ printk(KERN_WARNING "tried to kill an mm-less task!\n");
return;
+ }
if (verbose)
printk(KERN_ERR "Killed process %d (%s)\n",
@@ -348,13 +349,28 @@
struct mm_struct *mm;
struct task_struct *g, *q;
- task_lock(p);
mm = p->mm;
- if (!mm || mm->oom_adj == OOM_DISABLE) {
- task_unlock(p);
+
+ /* WARNING: mm may not be dereferenced since we did not obtain its
+ * value from get_task_mm(p). This is OK since all we need to do is
+ * compare mm to q->mm below.
+ *
+ * Furthermore, even if mm contains a non-NULL value, p->mm may
+ * change to NULL at any time since we do not hold task_lock(p).
+ * However, this is of no concern to us.
+ */
+
+ if (mm == NULL)
return 1;
- }
- task_unlock(p);
+
+ /*
+ * Don't kill the process if any threads are set to OOM_DISABLE
+ */
+ do_each_thread(g, q) {
+ if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
+ return 1;
+ } while_each_thread(g, q);
+
__oom_kill_task(p, 1);
/*
@@ -377,11 +393,10 @@
struct task_struct *c;
if (printk_ratelimit()) {
- task_lock(current);
printk(KERN_WARNING "%s invoked oom-killer: "
- "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
- current->comm, gfp_mask, order,
- current->mm ? current->mm->oom_adj : OOM_DISABLE);
+ "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+ current->comm, gfp_mask, order, current->oomkilladj);
+ task_lock(current);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
@@ -394,9 +409,8 @@
/*
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
- * if its mm is still attached.
*/
- if (p->mm && (p->flags & PF_EXITING)) {
+ if (p->flags & PF_EXITING) {
__oom_kill_task(p, 0);
return 0;
}