exec: rework the group exit and fix the race with kill

As Roland pointed out, we have the very old problem with exec. de_thread()
sets SIGNAL_GROUP_EXIT, kills other threads, changes ->group_leader and then
clears signal->flags. All signals (even fatal ones) sent in this window
(which is not too small) will be lost.

With this patch exec doesn't abuse SIGNAL_GROUP_EXIT. signal_group_exit(),
the new helper, should be used to detect exit_group() or exec() in progress.
It can have more users, but this patch does only strictly necessary changes.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Robin Holt <holt@sgi.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

commit: ed5d2cac114202fe2978a9cbcab8f5032796d538 [log] [tgz]
author: Oleg Nesterov <oleg@tv-sign.ru> Mon Feb 04 22:27:24 2008 -0800
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> Tue Feb 05 09:44:07 2008 -0800
tree: aa9aaea1aa0945bd9159685d1b04897d105a90c9
parent: f558b7e408026eb3c6afcd0e8fc1f7fe31195a6a [diff]
diff --git a/fs/exec.c b/fs/exec.c
index 966c5c5..be923e4 100644
--- a/fs/exec.c
+++ b/fs/exec.c

@@ -760,7 +760,7 @@
 	 */
 	read_lock(&tasklist_lock);
 	spin_lock_irq(lock);
-	if (sig->flags & SIGNAL_GROUP_EXIT) {
+	if (signal_group_exit(sig)) {
 		/*
 		 * Another group action in progress, just
 		 * return so that the signal is processed.
@@ -778,6 +778,7 @@
 	if (unlikely(tsk->group_leader == task_child_reaper(tsk)))
 		task_active_pid_ns(tsk)->child_reaper = tsk;
 
+	sig->group_exit_task = tsk;
 	zap_other_threads(tsk);
 	read_unlock(&tasklist_lock);
 
@@ -802,7 +803,6 @@
 	}
 
 	sig->notify_count = count;
-	sig->group_exit_task = tsk;
 	while (atomic_read(&sig->count) > count) {
 		__set_current_state(TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(lock);
@@ -871,15 +871,10 @@
 		leader->exit_state = EXIT_DEAD;
 
 		write_unlock_irq(&tasklist_lock);
-        }
+	}
 
 	sig->group_exit_task = NULL;
 	sig->notify_count = 0;
-	/*
-	 * There may be one thread left which is just exiting,
-	 * but it's safe to stop telling the group to kill themselves.
-	 */
-	sig->flags = 0;
 
 no_thread_group:
 	exit_itimers(sig);
@@ -1549,7 +1544,7 @@
 	int err = -EAGAIN;
 
 	spin_lock_irq(&tsk->sighand->siglock);
-	if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+	if (!signal_group_exit(tsk->signal)) {
 		tsk->signal->group_exit_code = exit_code;
 		zap_process(tsk);
 		err = 0;

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 680bb03..483ea4e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h

@@ -555,6 +555,13 @@
 #define SIGNAL_STOP_CONTINUED	0x00000004 /* SIGCONT since WCONTINUED reap */
 #define SIGNAL_GROUP_EXIT	0x00000008 /* group exit in progress */
 
+/* If true, all threads except ->group_exit_task have pending SIGKILL */
+static inline int signal_group_exit(const struct signal_struct *sig)
+{
+	return	(sig->flags & SIGNAL_GROUP_EXIT) ||
+		(sig->group_exit_task != NULL);
+}
+
 /*
  * Some day this will be a full-fledged user tracking system..
  */

diff --git a/kernel/exit.c b/kernel/exit.c
index 9e459fe..9d3d0f0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c

@@ -1083,11 +1083,12 @@
 		struct signal_struct *const sig = current->signal;
 		struct sighand_struct *const sighand = current->sighand;
 		spin_lock_irq(&sighand->siglock);
-		if (sig->flags & SIGNAL_GROUP_EXIT)
+		if (signal_group_exit(sig))
 			/* Another thread got here before we took the lock.  */
 			exit_code = sig->group_exit_code;
 		else {
 			sig->group_exit_code = exit_code;
+			sig->flags = SIGNAL_GROUP_EXIT;
 			zap_other_threads(current);
 		}
 		spin_unlock_irq(&sighand->siglock);

diff --git a/kernel/signal.c b/kernel/signal.c
index 1117b28..6a5f97c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c

@@ -957,7 +957,6 @@
 {
 	struct task_struct *t;
 
-	p->signal->flags = SIGNAL_GROUP_EXIT;
 	p->signal->group_stop_count = 0;
 
 	for (t = next_thread(p); t != p; t = next_thread(t)) {
@@ -1697,7 +1696,8 @@
 	} else {
 		struct task_struct *t;
 
-		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
+		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
+		    unlikely(sig->group_exit_task))
 			return 0;
 		/*
 		 * There is no group stop already in progress.
commit	ed5d2cac114202fe2978a9cbcab8f5032796d538	[log] [tgz]
author	Oleg Nesterov <oleg@tv-sign.ru>	Mon Feb 04 22:27:24 2008 -0800
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	Tue Feb 05 09:44:07 2008 -0800
tree	aa9aaea1aa0945bd9159685d1b04897d105a90c9
parent	f558b7e408026eb3c6afcd0e8fc1f7fe31195a6a [diff]