sched: group scheduling, sysfs tunables

Add tunables in sysfs to modify a user's cpu share.

A directory is created in sysfs for each new user in the system.

	/sys/kernel/uids/<uid>/cpu_share

Reading this file returns the cpu shares granted for the user.
Writing into this file modifies the cpu share for the user. Only an
administrator is allowed to modify a user's cpu share.

Ex:
	# cd /sys/kernel/uids/
	# cat 512/cpu_share
	1024
	# echo 2048 > 512/cpu_share
	# cat 512/cpu_share
	2048
	#

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
index 84901e7..88bcb87 100644
--- a/Documentation/sched-design-CFS.txt
+++ b/Documentation/sched-design-CFS.txt
@@ -117,3 +117,70 @@
    iterators of the scheduling modules are used. The balancing code got
    quite a bit simpler as a result.
 
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purpose:
+
+	- Based on user id (CONFIG_FAIR_USER_SCHED)
+		In this option, tasks are grouped according to their user id.
+	- Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+		This options lets the administrator create arbitrary groups
+		of tasks, using the "cgroup" pseudo filesystem. See
+		Documentation/cgroups.txt for more information about this
+		filesystem.
+
+Only one of these options to group tasks can be chosen and not both.
+
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
+
+	# cd /sys/kernel/uids
+	# cat 512/cpu_share		# Display user 512's CPU share
+	1024
+	# echo 2048 > 512/cpu_share	# Modify user 512's CPU share
+	# cat 512/cpu_share		# Display user 512's CPU share
+	2048
+	#
+
+CPU bandwidth between two users are divided in the ratio of their CPU shares.
+For ex: if you would like user "root" to get twice the bandwidth of user
+"guest", then set the cpu_share for both the users such that "root"'s
+cpu_share is twice "guest"'s cpu_share
+
+
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem. See example steps
+below to create task groups and modify their CPU share using the "cgroups"
+pseudo filesystem
+
+	# mkdir /dev/cpuctl
+	# mount -t cgroup -ocpu none /dev/cpuctl
+	# cd /dev/cpuctl
+
+	# mkdir multimedia	# create "multimedia" group of tasks
+	# mkdir browser		# create "browser" group of tasks
+
+	# #Configure the multimedia group to receive twice the CPU bandwidth
+	# #that of browser group
+
+	# echo 2048 > multimedia/cpu.shares
+	# echo 1024 > browser/cpu.shares
+
+	# firefox &	# Launch firefox and move it to "browser" group
+	# echo <firefox_pid> > browser/tasks
+
+	# #Launch gmplayer (or your favourite movie player)
+	# echo <movie_player_pid> > multimedia/tasks
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3cddbfc..04233c8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -87,6 +87,7 @@
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
 #include <linux/task_io_accounting.h>
+#include <linux/kobject.h>
 
 #include <asm/processor.h>
 
@@ -599,9 +600,18 @@
 
 #ifdef CONFIG_FAIR_USER_SCHED
 	struct task_group *tg;
+	struct kset kset;
+	struct subsys_attribute user_attr;
+	struct work_struct work;
 #endif
 };
 
+#ifdef CONFIG_FAIR_USER_SCHED
+extern int uids_kobject_init(void);
+#else
+static inline int uids_kobject_init(void) { return 0; }
+#endif
+
 extern struct user_struct *find_user(uid_t);
 
 extern struct user_struct root_user;
@@ -1848,6 +1858,7 @@
 extern void sched_destroy_group(struct task_group *tg);
 extern void sched_move_task(struct task_struct *tsk);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern unsigned long sched_group_shares(struct task_group *tg);
 
 #endif
 
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d0e5c48..6046939 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kexec.h>
+#include <linux/sched.h>
 
 #define KERNEL_ATTR_RO(_name) \
 static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,13 @@
 					      &notes_attr);
 	}
 
+	/*
+	 * Create "/sys/kernel/uids" directory and corresponding root user's
+	 * directory under it.
+	 */
+	if (!error)
+		error = uids_kobject_init();
+
 	return error;
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index a3c3ec8..9ac9989 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -162,6 +162,8 @@
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+	/* spinlock to serialize modification to shares */
+	spinlock_t lock;
 };
 
 /* Default task group's sched entity on each cpu */
@@ -6533,6 +6535,7 @@
 			se->parent = NULL;
 		}
 		init_task_group.shares = init_task_group_load;
+		spin_lock_init(&init_task_group.lock);
 #endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -6777,6 +6780,7 @@
 	}
 
 	tg->shares = NICE_0_LOAD;
+	spin_lock_init(&tg->lock);
 
 	return tg;
 
@@ -6897,8 +6901,9 @@
 {
 	int i;
 
+	spin_lock(&tg->lock);
 	if (tg->shares == shares)
-		return 0;
+		goto done;
 
 	/* return -EINVAL if the new value is not sane */
 
@@ -6906,7 +6911,14 @@
 	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
 
+done:
+	spin_unlock(&tg->lock);
 	return 0;
 }
 
+unsigned long sched_group_shares(struct task_group *tg)
+{
+	return tg->shares;
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6f87b31..0aab455 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -231,45 +231,6 @@
 	sched_debug_show(NULL, NULL);
 }
 
-#ifdef CONFIG_FAIR_USER_SCHED
-
-static DEFINE_MUTEX(root_user_share_mutex);
-
-static int
-root_user_share_read_proc(char *page, char **start, off_t off, int count,
-				 int *eof, void *data)
-{
-	return sprintf(page, "%d\n", init_task_group_load);
-}
-
-static int
-root_user_share_write_proc(struct file *file, const char __user *buffer,
-				 unsigned long count, void *data)
-{
-	unsigned long shares;
-	char kbuf[sizeof(unsigned long)+1];
-	int rc = 0;
-
-	if (copy_from_user(kbuf, buffer, sizeof(kbuf)))
-		return -EFAULT;
-
-	shares = simple_strtoul(kbuf, NULL, 0);
-
-	if (!shares)
-		shares = NICE_0_LOAD;
-
-	mutex_lock(&root_user_share_mutex);
-
-	init_task_group_load = shares;
-	rc = sched_group_set_shares(&init_task_group, shares);
-
-	mutex_unlock(&root_user_share_mutex);
-
-	return (rc < 0 ? rc : count);
-}
-
-#endif	/* CONFIG_FAIR_USER_SCHED */
-
 static int sched_debug_open(struct inode *inode, struct file *filp)
 {
 	return single_open(filp, sched_debug_show, NULL);
@@ -292,15 +253,6 @@
 
 	pe->proc_fops = &sched_debug_fops;
 
-#ifdef CONFIG_FAIR_USER_SCHED
-	pe = create_proc_entry("root_user_cpu_share", 0644, NULL);
-	if (!pe)
-		return -ENOMEM;
-
-	pe->read_proc = root_user_share_read_proc;
-	pe->write_proc = root_user_share_write_proc;
-#endif
-
 	return 0;
 }
 
diff --git a/kernel/user.c b/kernel/user.c
index 0c9a787..74cadea 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,7 +55,41 @@
 #endif
 };
 
+/*
+ * These routines must be called with the uidhash spinlock held!
+ */
+static inline void uid_hash_insert(struct user_struct *up,
+						struct hlist_head *hashent)
+{
+	hlist_add_head(&up->uidhash_node, hashent);
+}
+
+static inline void uid_hash_remove(struct user_struct *up)
+{
+	hlist_del_init(&up->uidhash_node);
+}
+
+static inline struct user_struct *uid_hash_find(uid_t uid,
+						struct hlist_head *hashent)
+{
+	struct user_struct *user;
+	struct hlist_node *h;
+
+	hlist_for_each_entry(user, h, hashent, uidhash_node) {
+		if (user->uid == uid) {
+			atomic_inc(&user->__count);
+			return user;
+		}
+	}
+
+	return NULL;
+}
+
 #ifdef CONFIG_FAIR_USER_SCHED
+
+static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
+static DEFINE_MUTEX(uids_mutex);
+
 static void sched_destroy_user(struct user_struct *up)
 {
 	sched_destroy_group(up->tg);
@@ -77,43 +111,174 @@
 	sched_move_task(p);
 }
 
+static inline void uids_mutex_lock(void)
+{
+	mutex_lock(&uids_mutex);
+}
+
+static inline void uids_mutex_unlock(void)
+{
+	mutex_unlock(&uids_mutex);
+}
+
+/* return cpu shares held by the user */
+ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+{
+	struct user_struct *up = container_of(kset, struct user_struct, kset);
+
+	return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
+}
+
+/* modify cpu shares held by the user */
+ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
+{
+	struct user_struct *up = container_of(kset, struct user_struct, kset);
+	unsigned long shares;
+	int rc;
+
+	sscanf(buffer, "%lu", &shares);
+
+	rc = sched_group_set_shares(up->tg, shares);
+
+	return (rc ? rc : size);
+}
+
+static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
+{
+	sa->attr.name = name;
+	sa->attr.mode = mode;
+	sa->show = cpu_shares_show;
+	sa->store = cpu_shares_store;
+}
+
+/* Create "/sys/kernel/uids/<uid>" directory and
+ *  "/sys/kernel/uids/<uid>/cpu_share" file for this user.
+ */
+static int user_kobject_create(struct user_struct *up)
+{
+	struct kset *kset = &up->kset;
+	struct kobject *kobj = &kset->kobj;
+	int error;
+
+	memset(kset, 0, sizeof(struct kset));
+	kobj->parent = &uids_kobject;	/* create under /sys/kernel/uids dir */
+	kobject_set_name(kobj, "%d", up->uid);
+	kset_init(kset);
+	user_attr_init(&up->user_attr, "cpu_share", 0644);
+
+	error = kobject_add(kobj);
+	if (error)
+		goto done;
+
+	error = sysfs_create_file(kobj, &up->user_attr.attr);
+	if (error)
+		kobject_del(kobj);
+
+done:
+	return error;
+}
+
+/* create these in sysfs filesystem:
+ * 	"/sys/kernel/uids" directory
+ * 	"/sys/kernel/uids/0" directory (for root user)
+ * 	"/sys/kernel/uids/0/cpu_share" file (for root user)
+ */
+int __init uids_kobject_init(void)
+{
+	int error;
+
+	/* create under /sys/kernel dir */
+	uids_kobject.parent = &kernel_subsys.kobj;
+	kobject_set_name(&uids_kobject, "uids");
+	kobject_init(&uids_kobject);
+
+	error = kobject_add(&uids_kobject);
+	if (!error)
+		error = user_kobject_create(&root_user);
+
+	return error;
+}
+
+/* work function to remove sysfs directory for a user and free up
+ * corresponding structures.
+ */
+static void remove_user_sysfs_dir(struct work_struct *w)
+{
+	struct user_struct *up = container_of(w, struct user_struct, work);
+	struct kobject *kobj = &up->kset.kobj;
+	unsigned long flags;
+	int remove_user = 0;
+
+	/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
+	 * atomic.
+	 */
+	uids_mutex_lock();
+
+	local_irq_save(flags);
+
+	if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+		uid_hash_remove(up);
+		remove_user = 1;
+		spin_unlock_irqrestore(&uidhash_lock, flags);
+	} else {
+		local_irq_restore(flags);
+	}
+
+	if (!remove_user)
+		goto done;
+
+	sysfs_remove_file(kobj, &up->user_attr.attr);
+	kobject_del(kobj);
+
+	sched_destroy_user(up);
+	key_put(up->uid_keyring);
+	key_put(up->session_keyring);
+	kmem_cache_free(uid_cachep, up);
+
+done:
+	uids_mutex_unlock();
+}
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+	/* restore back the count */
+	atomic_inc(&up->__count);
+	spin_unlock_irqrestore(&uidhash_lock, flags);
+
+	INIT_WORK(&up->work, remove_user_sysfs_dir);
+	schedule_work(&up->work);
+}
+
 #else	/* CONFIG_FAIR_USER_SCHED */
 
 static void sched_destroy_user(struct user_struct *up) { }
 static int sched_create_user(struct user_struct *up) { return 0; }
 static void sched_switch_user(struct task_struct *p) { }
+static inline int user_kobject_create(struct user_struct *up) { return 0; }
+static inline void uids_mutex_lock(void) { }
+static inline void uids_mutex_unlock(void) { }
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+	uid_hash_remove(up);
+	spin_unlock_irqrestore(&uidhash_lock, flags);
+	sched_destroy_user(up);
+	key_put(up->uid_keyring);
+	key_put(up->session_keyring);
+	kmem_cache_free(uid_cachep, up);
+}
 
 #endif	/* CONFIG_FAIR_USER_SCHED */
 
 /*
- * These routines must be called with the uidhash spinlock held!
- */
-static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
-{
-	hlist_add_head(&up->uidhash_node, hashent);
-}
-
-static inline void uid_hash_remove(struct user_struct *up)
-{
-	hlist_del_init(&up->uidhash_node);
-}
-
-static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
-{
-	struct user_struct *user;
-	struct hlist_node *h;
-
-	hlist_for_each_entry(user, h, hashent, uidhash_node) {
-		if(user->uid == uid) {
-			atomic_inc(&user->__count);
-			return user;
-		}
-	}
-
-	return NULL;
-}
-
-/*
  * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
  * caller must undo that ref with free_uid().
  *
@@ -139,16 +304,10 @@
 		return;
 
 	local_irq_save(flags);
-	if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
-		uid_hash_remove(up);
-		spin_unlock_irqrestore(&uidhash_lock, flags);
-		sched_destroy_user(up);
-		key_put(up->uid_keyring);
-		key_put(up->session_keyring);
-		kmem_cache_free(uid_cachep, up);
-	} else {
+	if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+		free_user(up, flags);
+	else
 		local_irq_restore(flags);
-	}
 }
 
 struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
@@ -156,6 +315,11 @@
 	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up;
 
+	/* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
+	 * atomic.
+	 */
+	uids_mutex_lock();
+
 	spin_lock_irq(&uidhash_lock);
 	up = uid_hash_find(uid, hashent);
 	spin_unlock_irq(&uidhash_lock);
@@ -191,6 +355,15 @@
 			return NULL;
 		}
 
+		if (user_kobject_create(new)) {
+			sched_destroy_user(new);
+			key_put(new->uid_keyring);
+			key_put(new->session_keyring);
+			kmem_cache_free(uid_cachep, new);
+			uids_mutex_unlock();
+			return NULL;
+		}
+
 		/*
 		 * Before adding this, check whether we raced
 		 * on adding the same user already..
@@ -198,7 +371,11 @@
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
-			sched_destroy_user(new);
+			/* This case is not possible when CONFIG_FAIR_USER_SCHED
+			 * is defined, since we serialize alloc_uid() using
+			 * uids_mutex. Hence no need to call
+			 * sched_destroy_user() or remove_user_sysfs_dir().
+			 */
 			key_put(new->uid_keyring);
 			key_put(new->session_keyring);
 			kmem_cache_free(uid_cachep, new);
@@ -209,6 +386,9 @@
 		spin_unlock_irq(&uidhash_lock);
 
 	}
+
+	uids_mutex_unlock();
+
 	return up;
 }