pid namespaces: changes to show virtual ids to user
This is the largest patch in the set. Make all (I hope) the places where
the pid is shown to or get from user operate on the virtual pids.
The idea is:
- all in-kernel data structures must store either struct pid itself
or the pid's global nr, obtained with pid_nr() call;
- when seeking the task from kernel code with the stored id one
should use find_task_by_pid() call that works with global pids;
- when showing pid's numerical value to the user the virtual one
should be used, but however when one shows task's pid outside this
task's namespace the global one is to be used;
- when getting the pid from userspace one need to consider this as
the virtual one and use appropriate task/pid-searching functions.
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: nuther build fix]
[akpm@linux-foundation.org: yet nuther build fix]
[akpm@linux-foundation.org: remove unneeded casts]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Paul Menage <menage@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/kernel/capability.c b/kernel/capability.c
index f02ad47..d4377c5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -62,8 +62,9 @@
spin_lock(&task_capability_lock);
read_lock(&tasklist_lock);
- if (pid && pid != current->pid) {
- target = find_task_by_pid(pid);
+ if (pid && pid != task_pid_vnr(current)) {
+ target = find_task_by_pid_ns(pid,
+ current->nsproxy->pid_ns);
if (!target) {
ret = -ESRCH;
goto out;
@@ -96,7 +97,7 @@
int found = 0;
struct pid *pgrp;
- pgrp = find_pid(pgrp_nr);
+ pgrp = find_pid_ns(pgrp_nr, current->nsproxy->pid_ns);
do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
target = g;
while_each_thread(g, target) {
@@ -185,7 +186,7 @@
if (get_user(pid, &header->pid))
return -EFAULT;
- if (pid && pid != current->pid && !capable(CAP_SETPCAP))
+ if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
return -EPERM;
if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
@@ -196,8 +197,9 @@
spin_lock(&task_capability_lock);
read_lock(&tasklist_lock);
- if (pid > 0 && pid != current->pid) {
- target = find_task_by_pid(pid);
+ if (pid > 0 && pid != task_pid_vnr(current)) {
+ target = find_task_by_pid_ns(pid,
+ current->nsproxy->pid_ns);
if (!target) {
ret = -ESRCH;
goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index 567909f..68d2703 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1112,15 +1112,17 @@
static int eligible_child(pid_t pid, int options, struct task_struct *p)
{
int err;
+ struct pid_namespace *ns;
+ ns = current->nsproxy->pid_ns;
if (pid > 0) {
- if (p->pid != pid)
+ if (task_pid_nr_ns(p, ns) != pid)
return 0;
} else if (!pid) {
- if (task_pgrp_nr(p) != task_pgrp_nr(current))
+ if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current))
return 0;
} else if (pid != -1) {
- if (task_pgrp_nr(p) != -pid)
+ if (task_pgrp_nr_ns(p, ns) != -pid)
return 0;
}
@@ -1190,9 +1192,12 @@
{
unsigned long state;
int retval, status, traced;
+ struct pid_namespace *ns;
+
+ ns = current->nsproxy->pid_ns;
if (unlikely(noreap)) {
- pid_t pid = p->pid;
+ pid_t pid = task_pid_nr_ns(p, ns);
uid_t uid = p->uid;
int exit_code = p->exit_code;
int why, status;
@@ -1311,11 +1316,11 @@
retval = put_user(status, &infop->si_status);
}
if (!retval && infop)
- retval = put_user(p->pid, &infop->si_pid);
+ retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid);
if (!retval && infop)
retval = put_user(p->uid, &infop->si_uid);
if (!retval)
- retval = p->pid;
+ retval = task_pid_nr_ns(p, ns);
if (traced) {
write_lock_irq(&tasklist_lock);
@@ -1352,6 +1357,7 @@
int __user *stat_addr, struct rusage __user *ru)
{
int retval, exit_code;
+ struct pid_namespace *ns;
if (!p->exit_code)
return 0;
@@ -1370,11 +1376,12 @@
* keep holding onto the tasklist_lock while we call getrusage and
* possibly take page faults for user memory.
*/
+ ns = current->nsproxy->pid_ns;
get_task_struct(p);
read_unlock(&tasklist_lock);
if (unlikely(noreap)) {
- pid_t pid = p->pid;
+ pid_t pid = task_pid_nr_ns(p, ns);
uid_t uid = p->uid;
int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
@@ -1445,11 +1452,11 @@
if (!retval && infop)
retval = put_user(exit_code, &infop->si_status);
if (!retval && infop)
- retval = put_user(p->pid, &infop->si_pid);
+ retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid);
if (!retval && infop)
retval = put_user(p->uid, &infop->si_uid);
if (!retval)
- retval = p->pid;
+ retval = task_pid_nr_ns(p, ns);
put_task_struct(p);
BUG_ON(!retval);
@@ -1469,6 +1476,7 @@
int retval;
pid_t pid;
uid_t uid;
+ struct pid_namespace *ns;
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
return 0;
@@ -1483,7 +1491,8 @@
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
spin_unlock_irq(&p->sighand->siglock);
- pid = p->pid;
+ ns = current->nsproxy->pid_ns;
+ pid = task_pid_nr_ns(p, ns);
uid = p->uid;
get_task_struct(p);
read_unlock(&tasklist_lock);
@@ -1494,7 +1503,7 @@
if (!retval && stat_addr)
retval = put_user(0xffff, stat_addr);
if (!retval)
- retval = p->pid;
+ retval = task_pid_nr_ns(p, ns);
} else {
retval = wait_noreap_copyout(p, pid, uid,
CLD_CONTINUED, SIGCONT,
diff --git a/kernel/fork.c b/kernel/fork.c
index ce9297e..a794bfc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -950,7 +950,7 @@
{
current->clear_child_tid = tidptr;
- return current->pid;
+ return task_pid_vnr(current);
}
static inline void rt_mutex_init_task(struct task_struct *p)
diff --git a/kernel/futex.c b/kernel/futex.c
index e45a65e..86b26003 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -53,6 +53,9 @@
#include <linux/signal.h>
#include <linux/module.h>
#include <linux/magic.h>
+#include <linux/pid.h>
+#include <linux/nsproxy.h>
+
#include <asm/futex.h>
#include "rtmutex_common.h"
@@ -443,7 +446,8 @@
struct task_struct *p;
rcu_read_lock();
- p = find_task_by_pid(pid);
+ p = find_task_by_pid_ns(pid,
+ current->nsproxy->pid_ns);
if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
p = ERR_PTR(-ESRCH);
@@ -653,7 +657,7 @@
if (!(uval & FUTEX_OWNER_DIED)) {
int ret = 0;
- newval = FUTEX_WAITERS | new_owner->pid;
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
@@ -1106,7 +1110,7 @@
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
struct task_struct *curr)
{
- u32 newtid = curr->pid | FUTEX_WAITERS;
+ u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
u32 uval, curval, newval;
int ret;
@@ -1368,7 +1372,7 @@
* (by doing a 0 -> TID atomic cmpxchg), while holding all
* the locks. It will most likely not succeed.
*/
- newval = current->pid;
+ newval = task_pid_vnr(current);
curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
@@ -1379,7 +1383,7 @@
* Detect deadlocks. In case of REQUEUE_PI this is a valid
* situation and we return success to user space.
*/
- if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
+ if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
ret = -EDEADLK;
goto out_unlock_release_sem;
}
@@ -1408,7 +1412,7 @@
*/
if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
/* Keep the OWNER_DIED bit */
- newval = (curval & ~FUTEX_TID_MASK) | current->pid;
+ newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
ownerdied = 0;
lock_taken = 1;
}
@@ -1587,7 +1591,7 @@
/*
* We release only a lock we actually own:
*/
- if ((uval & FUTEX_TID_MASK) != current->pid)
+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
/*
* First take all the futex related locks:
@@ -1608,7 +1612,7 @@
* anyone else up:
*/
if (!(uval & FUTEX_OWNER_DIED))
- uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0);
+ uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
if (unlikely(uval == -EFAULT))
@@ -1617,7 +1621,7 @@
* Rare case: we managed to release the lock atomically,
* no need to wake anyone else up:
*/
- if (unlikely(uval == current->pid))
+ if (unlikely(uval == task_pid_vnr(current)))
goto out_unlock;
/*
@@ -1854,7 +1858,8 @@
ret = -ESRCH;
rcu_read_lock();
- p = find_task_by_pid(pid);
+ p = find_task_by_pid_ns(pid,
+ current->nsproxy->pid_ns);
if (!p)
goto err_unlock;
ret = -EPERM;
@@ -1887,7 +1892,7 @@
if (get_user(uval, uaddr))
return -1;
- if ((uval & FUTEX_TID_MASK) == curr->pid) {
+ if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
/*
* Ok, this dying thread is truly holding a futex
* of interest. Set the OWNER_DIED bit atomically
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 2c2e295..cc098e1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -8,6 +8,7 @@
#include <linux/linkage.h>
#include <linux/compat.h>
+#include <linux/nsproxy.h>
#include <linux/futex.h>
#include <asm/uaccess.h>
@@ -124,7 +125,8 @@
ret = -ESRCH;
read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
+ p = find_task_by_pid_ns(pid,
+ current->nsproxy->pid_ns);
if (!p)
goto err_unlock;
ret = -EPERM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a73ebd3..66e99eb 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -19,6 +19,7 @@
#include <linux/security.h>
#include <linux/signal.h>
#include <linux/audit.h>
+#include <linux/pid_namespace.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
@@ -443,7 +444,8 @@
return ERR_PTR(-EPERM);
read_lock(&tasklist_lock);
- child = find_task_by_pid(pid);
+ child = find_task_by_pid_ns(pid,
+ current->nsproxy->pid_ns);
if (child)
get_task_struct(child);
diff --git a/kernel/sched.c b/kernel/sched.c
index 72a2a16..4ac56fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -44,6 +44,7 @@
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
+#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
@@ -1876,7 +1877,7 @@
preempt_enable();
#endif
if (current->set_child_tid)
- put_user(current->pid, current->set_child_tid);
+ put_user(task_pid_vnr(current), current->set_child_tid);
}
/*
@@ -4167,7 +4168,8 @@
*/
static struct task_struct *find_process_by_pid(pid_t pid)
{
- return pid ? find_task_by_pid(pid) : current;
+ return pid ?
+ find_task_by_pid_ns(pid, current->nsproxy->pid_ns) : current;
}
/* Actually do priority change: must hold rq lock. */
diff --git a/kernel/signal.c b/kernel/signal.c
index 9892388..d809cdd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -694,7 +694,7 @@
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_USER;
- q->info.si_pid = current->pid;
+ q->info.si_pid = task_pid_vnr(current);
q->info.si_uid = current->uid;
break;
case (unsigned long) SEND_SIG_PRIV:
@@ -1089,7 +1089,7 @@
{
int error;
rcu_read_lock();
- error = kill_pid_info(sig, info, find_pid(pid));
+ error = kill_pid_info(sig, info, find_vpid(pid));
rcu_read_unlock();
return error;
}
@@ -1160,9 +1160,9 @@
read_unlock(&tasklist_lock);
ret = count ? retval : -ESRCH;
} else if (pid < 0) {
- ret = kill_pgrp_info(sig, info, find_pid(-pid));
+ ret = kill_pgrp_info(sig, info, find_vpid(-pid));
} else {
- ret = kill_pid_info(sig, info, find_pid(pid));
+ ret = kill_pid_info(sig, info, find_vpid(pid));
}
rcu_read_unlock();
return ret;
@@ -1266,7 +1266,12 @@
int
kill_proc(pid_t pid, int sig, int priv)
{
- return kill_proc_info(sig, __si_special(priv), pid);
+ int ret;
+
+ rcu_read_lock();
+ ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
+ rcu_read_unlock();
+ return ret;
}
/*
@@ -1443,7 +1448,22 @@
info.si_signo = sig;
info.si_errno = 0;
- info.si_pid = tsk->pid;
+ /*
+ * we are under tasklist_lock here so our parent is tied to
+ * us and cannot exit and release its namespace.
+ *
+ * the only it can is to switch its nsproxy with sys_unshare,
+ * bu uncharing pid namespaces is not allowed, so we'll always
+ * see relevant namespace
+ *
+ * write_lock() currently calls preempt_disable() which is the
+ * same as rcu_read_lock(), but according to Oleg, this is not
+ * correct to rely on this
+ */
+ rcu_read_lock();
+ info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ rcu_read_unlock();
+
info.si_uid = tsk->uid;
/* FIXME: find out whether or not this is supposed to be c*time. */
@@ -1508,7 +1528,13 @@
info.si_signo = SIGCHLD;
info.si_errno = 0;
- info.si_pid = tsk->pid;
+ /*
+ * see comment in do_notify_parent() abot the following 3 lines
+ */
+ rcu_read_lock();
+ info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ rcu_read_unlock();
+
info.si_uid = tsk->uid;
/* FIXME: find out whether or not this is supposed to be c*time. */
@@ -1634,7 +1660,7 @@
memset(&info, 0, sizeof info);
info.si_signo = SIGTRAP;
info.si_code = exit_code;
- info.si_pid = current->pid;
+ info.si_pid = task_pid_vnr(current);
info.si_uid = current->uid;
/* Let the debugger run. */
@@ -1804,7 +1830,7 @@
info->si_signo = signr;
info->si_errno = 0;
info->si_code = SI_USER;
- info->si_pid = current->parent->pid;
+ info->si_pid = task_pid_vnr(current->parent);
info->si_uid = current->parent->uid;
}
@@ -2191,7 +2217,7 @@
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_USER;
- info.si_pid = current->tgid;
+ info.si_pid = task_tgid_vnr(current);
info.si_uid = current->uid;
return kill_something_info(sig, &info, pid);
@@ -2207,12 +2233,12 @@
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_TKILL;
- info.si_pid = current->tgid;
+ info.si_pid = task_tgid_vnr(current);
info.si_uid = current->uid;
read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
- if (p && (tgid <= 0 || p->tgid == tgid)) {
+ p = find_task_by_pid_ns(pid, current->nsproxy->pid_ns);
+ if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
error = check_kill_permission(sig, &info, p);
/*
* The null signal is a permissions and process existence
diff --git a/kernel/sys.c b/kernel/sys.c
index 4cfa213..23620d5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -152,7 +152,8 @@
switch (which) {
case PRIO_PROCESS:
if (who)
- p = find_task_by_pid(who);
+ p = find_task_by_pid_ns(who,
+ current->nsproxy->pid_ns);
else
p = current;
if (p)
@@ -160,7 +161,7 @@
break;
case PRIO_PGRP:
if (who)
- pgrp = find_pid(who);
+ pgrp = find_vpid(who);
else
pgrp = task_pgrp(current);
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
@@ -209,7 +210,8 @@
switch (which) {
case PRIO_PROCESS:
if (who)
- p = find_task_by_pid(who);
+ p = find_task_by_pid_ns(who,
+ current->nsproxy->pid_ns);
else
p = current;
if (p) {
@@ -220,7 +222,7 @@
break;
case PRIO_PGRP:
if (who)
- pgrp = find_pid(who);
+ pgrp = find_vpid(who);
else
pgrp = task_pgrp(current);
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
@@ -917,9 +919,10 @@
struct task_struct *p;
struct task_struct *group_leader = current->group_leader;
int err = -EINVAL;
+ struct pid_namespace *ns;
if (!pid)
- pid = group_leader->pid;
+ pid = task_pid_vnr(group_leader);
if (!pgid)
pgid = pid;
if (pgid < 0)
@@ -928,10 +931,12 @@
/* From this point forward we keep holding onto the tasklist lock
* so that our parent does not change from under us. -DaveM
*/
+ ns = current->nsproxy->pid_ns;
+
write_lock_irq(&tasklist_lock);
err = -ESRCH;
- p = find_task_by_pid(pid);
+ p = find_task_by_pid_ns(pid, ns);
if (!p)
goto out;
@@ -957,9 +962,9 @@
goto out;
if (pgid != pid) {
- struct task_struct *g =
- find_task_by_pid_type(PIDTYPE_PGID, pgid);
+ struct task_struct *g;
+ g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns);
if (!g || task_session(g) != task_session(group_leader))
goto out;
}
@@ -968,10 +973,13 @@
if (err)
goto out;
- if (task_pgrp_nr(p) != pgid) {
+ if (task_pgrp_nr_ns(p, ns) != pgid) {
+ struct pid *pid;
+
detach_pid(p, PIDTYPE_PGID);
- p->signal->pgrp = pgid;
- attach_pid(p, PIDTYPE_PGID, find_pid(pgid));
+ pid = find_vpid(pgid);
+ attach_pid(p, PIDTYPE_PGID, pid);
+ p->signal->pgrp = pid_nr(pid);
}
err = 0;
@@ -984,19 +992,21 @@
asmlinkage long sys_getpgid(pid_t pid)
{
if (!pid)
- return task_pgrp_nr(current);
+ return task_pgrp_vnr(current);
else {
int retval;
struct task_struct *p;
+ struct pid_namespace *ns;
+
+ ns = current->nsproxy->pid_ns;
read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
-
+ p = find_task_by_pid_ns(pid, ns);
retval = -ESRCH;
if (p) {
retval = security_task_getpgid(p);
if (!retval)
- retval = task_pgrp_nr(p);
+ retval = task_pgrp_nr_ns(p, ns);
}
read_unlock(&tasklist_lock);
return retval;
@@ -1008,7 +1018,7 @@
asmlinkage long sys_getpgrp(void)
{
/* SMP - assuming writes are word atomic this is fine */
- return task_pgrp_nr(current);
+ return task_pgrp_vnr(current);
}
#endif
@@ -1016,19 +1026,21 @@
asmlinkage long sys_getsid(pid_t pid)
{
if (!pid)
- return task_session_nr(current);
+ return task_session_vnr(current);
else {
int retval;
struct task_struct *p;
+ struct pid_namespace *ns;
+
+ ns = current->nsproxy->pid_ns;
read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
-
+ p = find_task_by_pid_ns(pid, ns);
retval = -ESRCH;
if (p) {
retval = security_task_getsid(p);
if (!retval)
- retval = task_session_nr(p);
+ retval = task_session_nr_ns(p, ns);
}
read_unlock(&tasklist_lock);
return retval;
@@ -1065,7 +1077,7 @@
group_leader->signal->tty = NULL;
spin_unlock(&group_leader->sighand->siglock);
- err = task_pgrp_nr(group_leader);
+ err = task_pgrp_vnr(group_leader);
out:
write_unlock_irq(&tasklist_lock);
return err;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 44868e4..3b4efbe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2278,7 +2278,7 @@
pid_t tmp;
int r;
- tmp = pid_nr(cad_pid);
+ tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns);
r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
lenp, ppos, NULL, NULL);
diff --git a/kernel/timer.c b/kernel/timer.c
index 8521d10..fb4e67d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -26,6 +26,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
+#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
@@ -956,7 +957,7 @@
*/
asmlinkage long sys_getpid(void)
{
- return current->tgid;
+ return task_tgid_vnr(current);
}
/*
@@ -970,7 +971,7 @@
int pid;
rcu_read_lock();
- pid = rcu_dereference(current->real_parent)->tgid;
+ pid = task_ppid_nr_ns(current, current->nsproxy->pid_ns);
rcu_read_unlock();
return pid;
@@ -1102,7 +1103,7 @@
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
{
- return current->pid;
+ return task_pid_vnr(current);
}
/**