ftrace: add tracing of context switches

This patch adds context switch tracing, of the format of:

                  _------=> CPU#
                 / _-----=> irqs-off
                | / _----=> need-resched
                || / _---=> hardirq/softirq
                ||| / _--=> preempt-depth
                |||| /
                |||||     delay
    cmd     pid ||||| time  |      pid:prio:state
       \   /    |||||   \   |      /
  swapper-0     1d..3    137us+:  0:140:R --> 2912:120
     sshd-2912  1d..3    216us+:  2912:120:S --> 0:140
  swapper-0     1d..3    261us+:  0:140:R --> 2912:120
     bash-2920  0d..3    267us+:  2920:120:S --> 0:140
     sshd-2912  1d..3    330us!:  2912:120:S --> 0:140
  swapper-0     1d..3   2389us+:  0:140:R --> 2847:120
 yum-upda-2847  1d..3   2411us!:  2847:120:S --> 0:140
  swapper-0     0d..3  11089us+:  0:140:R --> 3139:120
 gdm-bina-3139  0d..3  11113us!:  3139:120:S --> 0:140
  swapper-0     1d..3 102328us+:  0:140:R --> 2847:120
 yum-upda-2847  1d..3 102348us!:  2847:120:S --> 0:140

 "sched_switch" is added to /debugfs/tracing/available_tracers

[ Eugene Teo <eugeneteo@kernel.sg: remove unused tracing_sched_switch_enabled ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1399f37..5d6aa92 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -13,6 +13,7 @@
 	depends on DEBUG_KERNEL && HAVE_FTRACE
 	select FRAME_POINTER
 	select TRACING
+	select CONTEXT_SWITCH_TRACER
 	help
 	  Enable the kernel to trace every kernel function. This is done
 	  by using a compiler feature to insert a small, 5-byte No-Operation
@@ -21,3 +22,13 @@
 	  tracing is enabled by the administrator. If it's runtime disabled
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.
+
+config CONTEXT_SWITCH_TRACER
+	bool "Trace process context switches"
+	depends on DEBUG_KERNEL
+	select TRACING
+	select MARKERS
+	help
+	  This tracer gets called from the context switch and records
+	  all switching of tasks.
+
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 6bb5e50..6b54ceb 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,6 +1,7 @@
 obj-$(CONFIG_FTRACE) += libftrace.o
 
 obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FTRACE) += trace_functions.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644
index 0000000..3e4771d
--- /dev/null
+++ b/kernel/trace/trace_sched_switch.c
@@ -0,0 +1,116 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/marker.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array	*ctx_trace;
+static int __read_mostly	tracer_enabled;
+
+static void notrace
+ctx_switch_func(struct task_struct *prev, struct task_struct *next)
+{
+	struct trace_array *tr = ctx_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		tracing_sched_switch_trace(tr, data, prev, next, flags);
+
+	atomic_dec(&data->disabled);
+	raw_local_irq_restore(flags);
+}
+
+void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
+{
+	tracing_record_cmdline(prev);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	ctx_switch_func(prev, next);
+
+	/*
+	 * Chain to the wakeup tracer (this is a NOP if disabled):
+	 */
+	wakeup_sched_switch(prev, next);
+}
+
+static notrace void sched_switch_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static notrace void start_sched_trace(struct trace_array *tr)
+{
+	sched_switch_reset(tr);
+	tracer_enabled = 1;
+}
+
+static notrace void stop_sched_trace(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+}
+
+static notrace void sched_switch_trace_init(struct trace_array *tr)
+{
+	ctx_trace = tr;
+
+	if (tr->ctrl)
+		start_sched_trace(tr);
+}
+
+static notrace void sched_switch_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_sched_trace(tr);
+}
+
+static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+{
+	/* When starting a new trace, reset the buffers */
+	if (tr->ctrl)
+		start_sched_trace(tr);
+	else
+		stop_sched_trace(tr);
+}
+
+static struct tracer sched_switch_trace __read_mostly =
+{
+	.name		= "sched_switch",
+	.init		= sched_switch_trace_init,
+	.reset		= sched_switch_trace_reset,
+	.ctrl_update	= sched_switch_trace_ctrl_update,
+};
+
+__init static int init_sched_switch_trace(void)
+{
+	return register_tracer(&sched_switch_trace);
+}
+device_initcall(init_sched_switch_trace);