Merge branch 'arch-frv' into no-rebases
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index ef54a59..138fc9c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -6,6 +6,8 @@
 	select GENERIC_IOMAP
 	select GENERIC_IRQ_PROBE
 	select GENERIC_IRQ_SHOW
+	select GENERIC_KERNEL_EXECVE
+	select GENERIC_KERNEL_THREAD
 	select GENERIC_SMP_IDLE_THREAD
 	select GENERIC_TIME_VSYSCALL
 	select HARDIRQS_SW_RESEND
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 5d81004..42471d0 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -126,11 +126,6 @@
 extern struct task_struct *cpu_switch_to(struct task_struct *prev,
 					 struct task_struct *next);
 
-/*
- * Create a new kernel thread
- */
-extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
-
 #define task_pt_regs(p) \
 	((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)
 
diff --git a/arch/arm64/include/asm/syscalls.h b/arch/arm64/include/asm/syscalls.h
index 09ff335..81680a0 100644
--- a/arch/arm64/include/asm/syscalls.h
+++ b/arch/arm64/include/asm/syscalls.h
@@ -23,9 +23,6 @@
 /*
  * System call wrappers implemented in kernel/entry.S.
  */
-asmlinkage long sys_execve_wrapper(const char __user *filename,
-				   const char __user *const __user *argv,
-				   const char __user *const __user *envp);
 asmlinkage long sys_clone_wrapper(unsigned long clone_flags,
 				  unsigned long newsp,
 				  void __user *parent_tid,
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 63f853f..b40dc6b 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -26,4 +26,5 @@
 #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
 #define __ARCH_WANT_COMPAT_SYS_SENDFILE
 #endif
+#define __ARCH_WANT_SYS_EXECVE
 #include <uapi/asm/unistd.h>
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 6d909fa..9035e6a 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -32,7 +32,7 @@
 __SYSCALL(8,   sys_creat)
 __SYSCALL(9,   sys_link)
 __SYSCALL(10,  sys_unlink)
-__SYSCALL(11,  compat_sys_execve_wrapper)
+__SYSCALL(11,  compat_sys_execve)
 __SYSCALL(12,  sys_chdir)
 __SYSCALL(13,  sys_ni_syscall)			/* 13 was sys_time */
 __SYSCALL(14,  sys_mknod)
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a6f3f7d..6165318 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -594,7 +594,7 @@
 /*
  * "slow" syscall return path.
  */
-ENTRY(ret_to_user)
+ret_to_user:
 	disable_irq				// disable interrupts
 	ldr	x1, [tsk, #TI_FLAGS]
 	and	x2, x1, #_TIF_WORK_MASK
@@ -611,7 +611,10 @@
  */
 ENTRY(ret_from_fork)
 	bl	schedule_tail
-	get_thread_info tsk
+	cbz	x19, 1f				// not a kernel thread
+	mov	x0, x20
+	blr	x19
+1:	get_thread_info tsk
 	b	ret_to_user
 ENDPROC(ret_from_fork)
 
@@ -673,11 +676,6 @@
 /*
  * Special system call wrappers.
  */
-ENTRY(sys_execve_wrapper)
-	mov	x3, sp
-	b	sys_execve
-ENDPROC(sys_execve_wrapper)
-
 ENTRY(sys_clone_wrapper)
 	mov	x5, sp
 	b	sys_clone
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index f22965e..bf615e2 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -240,27 +240,35 @@
 	struct pt_regs *childregs = task_pt_regs(p);
 	unsigned long tls = p->thread.tp_value;
 
-	*childregs = *regs;
-	childregs->regs[0] = 0;
-
-	if (is_compat_thread(task_thread_info(p)))
-		childregs->compat_sp = stack_start;
-	else {
-		/*
-		 * Read the current TLS pointer from tpidr_el0 as it may be
-		 * out-of-sync with the saved value.
-		 */
-		asm("mrs %0, tpidr_el0" : "=r" (tls));
-		childregs->sp = stack_start;
-	}
-
 	memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
-	p->thread.cpu_context.sp = (unsigned long)childregs;
-	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
 
-	/* If a TLS pointer was passed to clone, use that for the new thread. */
-	if (clone_flags & CLONE_SETTLS)
-		tls = regs->regs[3];
+	if (likely(regs)) {
+		*childregs = *regs;
+		childregs->regs[0] = 0;
+		if (is_compat_thread(task_thread_info(p))) {
+			childregs->compat_sp = stack_start;
+		} else {
+			/*
+			 * Read the current TLS pointer from tpidr_el0 as it may be
+			 * out-of-sync with the saved value.
+			 */
+			asm("mrs %0, tpidr_el0" : "=r" (tls));
+			childregs->sp = stack_start;
+		}
+		/*
+		 * If a TLS pointer was passed to clone (4th argument), use it
+		 * for the new thread.
+		 */
+		if (clone_flags & CLONE_SETTLS)
+			tls = regs->regs[3];
+	} else {
+		memset(childregs, 0, sizeof(struct pt_regs));
+		childregs->pstate = PSR_MODE_EL1h;
+		p->thread.cpu_context.x19 = stack_start;
+		p->thread.cpu_context.x20 = stk_sz;
+	}
+	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
+	p->thread.cpu_context.sp = (unsigned long)childregs;
 	p->thread.tp_value = tls;
 
 	ptrace_hw_copy_thread(p);
@@ -327,43 +335,6 @@
 }
 EXPORT_SYMBOL(dump_fpu);
 
-/*
- * Shuffle the argument into the correct register before calling the
- * thread function.  x1 is the thread argument, x2 is the pointer to
- * the thread function, and x3 points to the exit function.
- */
-extern void kernel_thread_helper(void);
-asm(	".section .text\n"
-"	.align\n"
-"	.type	kernel_thread_helper, #function\n"
-"kernel_thread_helper:\n"
-"	mov	x0, x1\n"
-"	mov	x30, x3\n"
-"	br	x2\n"
-"	.size	kernel_thread_helper, . - kernel_thread_helper\n"
-"	.previous");
-
-#define kernel_thread_exit	do_exit
-
-/*
- * Create a kernel thread.
- */
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
-{
-	struct pt_regs regs;
-
-	memset(&regs, 0, sizeof(regs));
-
-	regs.regs[1] = (unsigned long)arg;
-	regs.regs[2] = (unsigned long)fn;
-	regs.regs[3] = (unsigned long)kernel_thread_exit;
-	regs.pc = (unsigned long)kernel_thread_helper;
-	regs.pstate = PSR_MODE_EL1h;
-
-	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
 unsigned long get_wchan(struct task_struct *p)
 {
 	struct stackframe frame;
diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c
index b120df3..9c77c0b 100644
--- a/arch/arm64/kernel/sys.c
+++ b/arch/arm64/kernel/sys.c
@@ -41,70 +41,6 @@
 	return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
 }
 
-/*
- * sys_execve() executes a new program.
- */
-asmlinkage long sys_execve(const char __user *filenamei,
-			   const char __user *const __user *argv,
-			   const char __user *const __user *envp,
-			   struct pt_regs *regs)
-{
-	long error;
-	struct filename *filename;
-
-	filename = getname(filenamei);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = do_execve(filename->name, argv, envp, regs);
-	putname(filename);
-out:
-	return error;
-}
-
-int kernel_execve(const char *filename,
-		  const char *const argv[],
-		  const char *const envp[])
-{
-	struct pt_regs regs;
-	int ret;
-
-	memset(&regs, 0, sizeof(struct pt_regs));
-	ret = do_execve(filename,
-			(const char __user *const __user *)argv,
-			(const char __user *const __user *)envp, &regs);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * Save argc to the register structure for userspace.
-	 */
-	regs.regs[0] = ret;
-
-	/*
-	 * We were successful.  We won't be returning to our caller, but
-	 * instead to user space by manipulating the kernel stack.
-	 */
-	asm(	"add	x0, %0, %1\n\t"
-		"mov	x1, %2\n\t"
-		"mov	x2, %3\n\t"
-		"bl	memmove\n\t"	/* copy regs to top of stack */
-		"mov	x27, #0\n\t"	/* not a syscall */
-		"mov	x28, %0\n\t"	/* thread structure */
-		"mov	sp, x0\n\t"	/* reposition stack pointer */
-		"b	ret_to_user"
-		:
-		: "r" (current_thread_info()),
-		  "Ir" (THREAD_START_SP - sizeof(regs)),
-		  "r" (&regs),
-		  "Ir" (sizeof(regs))
-		: "x0", "x1", "x2", "x27", "x28", "x30", "memory");
-
- out:
-	return ret;
-}
-EXPORT_SYMBOL(kernel_execve);
-
 asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
 			 unsigned long prot, unsigned long flags,
 			 unsigned long fd, off_t off)
@@ -118,7 +54,6 @@
 /*
  * Wrappers to pass the pt_regs argument.
  */
-#define sys_execve		sys_execve_wrapper
 #define sys_clone		sys_clone_wrapper
 #define sys_rt_sigreturn	sys_rt_sigreturn_wrapper
 #define sys_sigaltstack		sys_sigaltstack_wrapper
diff --git a/arch/arm64/kernel/sys32.S b/arch/arm64/kernel/sys32.S
index 54c4aec..92145d4 100644
--- a/arch/arm64/kernel/sys32.S
+++ b/arch/arm64/kernel/sys32.S
@@ -36,11 +36,6 @@
 	b	compat_sys_vfork
 ENDPROC(compat_sys_vfork_wrapper)
 
-compat_sys_execve_wrapper:
-	mov	x3, sp
-	b	compat_sys_execve
-ENDPROC(compat_sys_execve_wrapper)
-
 compat_sys_clone_wrapper:
 	mov	x5, sp
 	b	compat_sys_clone
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 906e3bd..d140b73 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -49,24 +49,6 @@
 		       regs, 0, NULL, NULL);
 }
 
-asmlinkage int compat_sys_execve(const char __user *filenamei,
-				 compat_uptr_t argv, compat_uptr_t envp,
-				 struct pt_regs *regs)
-{
-	int error;
-	struct filename *filename;
-
-	filename = getname(filenamei);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = compat_do_execve(filename->name, compat_ptr(argv),
-					compat_ptr(envp), regs);
-	putname(filename);
-out:
-	return error;
-}
-
 asmlinkage int compat_sys_sched_rr_get_interval(compat_pid_t pid,
 						struct compat_timespec __user *interval)
 {
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index aee1b56..66eab37 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -18,6 +18,7 @@
 	select OF_EARLY_FLATTREE
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 	select MODULES_USE_ELF_RELA
 
 config MMU
diff --git a/arch/c6x/include/uapi/asm/unistd.h b/arch/c6x/include/uapi/asm/unistd.h
index 4ff747d..625beaf 100644
--- a/arch/c6x/include/uapi/asm/unistd.h
+++ b/arch/c6x/include/uapi/asm/unistd.h
@@ -14,7 +14,6 @@
  *   more details.
  */
 
-#define __ARCH_WANT_KERNEL_EXECVE
 #define __ARCH_WANT_SYS_EXECVE
 
 /* Use the standard ABI for syscalls. */
diff --git a/arch/c6x/kernel/entry.S b/arch/c6x/kernel/entry.S
index 5449c36..75f6f36 100644
--- a/arch/c6x/kernel/entry.S
+++ b/arch/c6x/kernel/entry.S
@@ -413,19 +413,9 @@
 0:
 	B	.S2	B10		   /* call fn */
 	LDW	.D2T1	*+SP(REGS_A1+8),A4 /* get arg */
-	MVKL	.S2	sys_exit,B11
-	MVKH	.S2	sys_exit,B11
-	ADDKPC	.S2	0f,B3,1
-0:
-	BNOP	.S2	B11,5	/* jump to sys_exit */
+	ADDKPC	.S2	ret_from_fork_2,B3,3
 ENDPROC(ret_from_kernel_thread)
 
-ENTRY(ret_from_kernel_execve)
-	GET_THREAD_INFO A12
-	BNOP	.S2	syscall_exit,4
-	ADD	.D2X	A4,-8,SP
-ENDPROC(ret_from_kernel_execve)
-
 	;;
 	;; These are the interrupt handlers, responsible for calling __do_IRQ()
 	;; int6 is used for syscalls (see _system_call entry)
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 0744f7d..e418803 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -31,6 +31,8 @@
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_CLOCKEVENTS_BROADCAST
 	select MODULES_USE_ELF_RELA
+	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 	---help---
 	  Qualcomm Hexagon is a processor architecture designed for high
 	  performance and low power across a wide variety of applications.
diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h
index a03323a..6dd5d37 100644
--- a/arch/hexagon/include/asm/processor.h
+++ b/arch/hexagon/include/asm/processor.h
@@ -34,7 +34,6 @@
 struct task_struct;
 
 /*  this is defined in arch/process.c  */
-extern pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 extern void start_thread(struct pt_regs *, unsigned long, unsigned long);
diff --git a/arch/hexagon/include/uapi/asm/ptrace.h b/arch/hexagon/include/uapi/asm/ptrace.h
index 8ef7840..1ffce0c 100644
--- a/arch/hexagon/include/uapi/asm/ptrace.h
+++ b/arch/hexagon/include/uapi/asm/ptrace.h
@@ -32,4 +32,8 @@
 extern int regs_query_register_offset(const char *name);
 extern const char *regs_query_register_name(unsigned int offset);
 
+#define current_pt_regs() \
+	((struct pt_regs *) \
+	 ((unsigned long)current_thread_info() + THREAD_SIZE) - 1)
+
 #endif
diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h
index 81312d6..26b2e0f 100644
--- a/arch/hexagon/include/uapi/asm/unistd.h
+++ b/arch/hexagon/include/uapi/asm/unistd.h
@@ -27,5 +27,6 @@
  */
 
 #define sys_mmap2 sys_mmap_pgoff
+#define __ARCH_WANT_SYS_EXECVE
 
 #include <asm-generic/unistd.h>
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index 9f6d741..cfbc52b 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -26,33 +26,6 @@
 #include <linux/slab.h>
 
 /*
- * Kernel thread creation.  The desired kernel function is "wrapped"
- * in the kernel_thread_helper function, which does cleanup
- * afterwards.
- */
-static void __noreturn kernel_thread_helper(void *arg, int (*fn)(void *))
-{
-	do_exit(fn(arg));
-}
-
-int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
-{
-	struct pt_regs regs;
-
-	memset(&regs, 0, sizeof(regs));
-	/*
-	 * Yes, we're exploting illicit knowledge of the ABI here.
-	 */
-	regs.r00 = (unsigned long) arg;
-	regs.r01 = (unsigned long) fn;
-	pt_set_elr(&regs, (unsigned long)kernel_thread_helper);
-	pt_set_kmode(&regs);
-
-	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
-/*
  * Program thread launch.  Often defined as a macro in processor.h,
  * but we're shooting for a small footprint and it's not an inner-loop
  * performance-critical operation.
@@ -114,7 +87,7 @@
  * Copy architecture-specific thread state
  */
 int copy_thread(unsigned long clone_flags, unsigned long usp,
-		unsigned long unused, struct task_struct *p,
+		unsigned long arg, struct task_struct *p,
 		struct pt_regs *regs)
 {
 	struct thread_info *ti = task_thread_info(p);
@@ -125,61 +98,50 @@
 	childregs = (struct pt_regs *) (((unsigned long) ti + THREAD_SIZE) -
 					sizeof(*childregs));
 
-	memcpy(childregs, regs, sizeof(*childregs));
 	ti->regs = childregs;
 
 	/*
 	 * Establish kernel stack pointer and initial PC for new thread
+	 * Note that unlike the usual situation, we do not copy the
+	 * parent's callee-saved here; those are in pt_regs and whatever
+	 * we leave here will be overridden on return to userland.
 	 */
 	ss = (struct hexagon_switch_stack *) ((unsigned long) childregs -
 						    sizeof(*ss));
 	ss->lr = (unsigned long)ret_from_fork;
 	p->thread.switch_sp = ss;
-
-	/* If User mode thread, set pt_reg stack pointer as per parameter */
-	if (user_mode(childregs)) {
-		pt_set_rte_sp(childregs, usp);
-
-		/* Child sees zero return value */
-		childregs->r00 = 0;
-
-		/*
-		 * The clone syscall has the C signature:
-		 * int [r0] clone(int flags [r0],
-		 *           void *child_frame [r1],
-		 *           void *parent_tid [r2],
-		 *           void *child_tid [r3],
-		 *           void *thread_control_block [r4]);
-		 * ugp is used to provide TLS support.
-		 */
-		if (clone_flags & CLONE_SETTLS)
-			childregs->ugp = childregs->r04;
-
-		/*
-		 * Parent sees new pid -- not necessary, not even possible at
-		 * this point in the fork process
-		 * Might also want to set things like ti->addr_limit
-		 */
-	} else {
-		/*
-		 * If kernel thread, resume stack is kernel stack base.
-		 * Note that this is pointer arithmetic on pt_regs *
-		 */
-		pt_set_rte_sp(childregs, (unsigned long)(childregs + 1));
-		/*
-		 * We need the current thread_info fast path pointer
-		 * set up in pt_regs.  The register to be used is
-		 * parametric for assembler code, but the mechanism
-		 * doesn't drop neatly into C.  Needs to be fixed.
-		 */
-		childregs->THREADINFO_REG = (unsigned long) ti;
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		memset(childregs, 0, sizeof(struct pt_regs));
+		/* r24 <- fn, r25 <- arg */
+		ss->r2524 = usp | ((u64)arg << 32);
+		pt_set_kmode(childregs);
+		return 0;
 	}
+	memcpy(childregs, regs, sizeof(*childregs));
+	ss->r2524 = 0;
+
+	pt_set_rte_sp(childregs, usp);
+
+	/* Child sees zero return value */
+	childregs->r00 = 0;
 
 	/*
-	 * thread_info pointer is pulled out of task_struct "stack"
-	 * field on switch_to.
+	 * The clone syscall has the C signature:
+	 * int [r0] clone(int flags [r0],
+	 *           void *child_frame [r1],
+	 *           void *parent_tid [r2],
+	 *           void *child_tid [r3],
+	 *           void *thread_control_block [r4]);
+	 * ugp is used to provide TLS support.
 	 */
-	p->stack = (void *)ti;
+	if (clone_flags & CLONE_SETTLS)
+		childregs->ugp = childregs->r04;
+
+	/*
+	 * Parent sees new pid -- not necessary, not even possible at
+	 * this point in the fork process
+	 * Might also want to set things like ti->addr_limit
+	 */
 
 	return 0;
 }
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index 5047b8b..fe0d137 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -249,14 +249,14 @@
  */
 asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss)
 {
-	struct pt_regs *regs = current_thread_info()->regs;
+	struct pt_regs *regs = current_pt_regs();
 
 	return do_sigaltstack(uss, uoss, regs->r29);
 }
 
 asmlinkage int sys_rt_sigreturn(void)
 {
-	struct pt_regs *regs = current_thread_info()->regs;
+	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe __user *frame;
 	sigset_t blocked;
 
diff --git a/arch/hexagon/kernel/syscall.c b/arch/hexagon/kernel/syscall.c
index 319fa64..d2cc327 100644
--- a/arch/hexagon/kernel/syscall.c
+++ b/arch/hexagon/kernel/syscall.c
@@ -35,55 +35,13 @@
  * See signal.c for signal-related system call wrappers.
  */
 
-asmlinkage int sys_execve(char __user *ufilename,
-			  const char __user *const __user *argv,
-			  const char __user *const __user *envp)
-{
-	struct pt_regs *pregs = current_thread_info()->regs;
-	struct filename *filename;
-	int retval;
-
-	filename = getname(ufilename);
-	retval = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		return retval;
-
-	retval = do_execve(filename->name, argv, envp, pregs);
-	putname(filename);
-
-	return retval;
-}
-
 asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
 			 unsigned long parent_tidp, unsigned long child_tidp)
 {
-	struct pt_regs *pregs = current_thread_info()->regs;
+	struct pt_regs *pregs = current_pt_regs();
 
 	if (!newsp)
 		newsp = pregs->SP;
 	return do_fork(clone_flags, newsp, pregs, 0, (int __user *)parent_tidp,
 		       (int __user *)child_tidp);
 }
-
-/*
- * Do a system call from the kernel, so as to have a proper pt_regs
- * and recycle the sys_execvpe infrustructure.
- */
-int kernel_execve(const char *filename,
-		  const char *const argv[], const char *const envp[])
-{
-	register unsigned long __a0 asm("r0") = (unsigned long) filename;
-	register unsigned long __a1 asm("r1") = (unsigned long) argv;
-	register unsigned long __a2 asm("r2") = (unsigned long) envp;
-	int retval;
-
-	__asm__ volatile(
-		"	R6 = #%4;\n"
-		"	trap0(#1);\n"
-		"	%0 = R0;\n"
-		: "=r" (retval)
-		: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_execve)
-	);
-
-	return retval;
-}
diff --git a/arch/hexagon/kernel/vm_entry.S b/arch/hexagon/kernel/vm_entry.S
index cd71673..425e50c 100644
--- a/arch/hexagon/kernel/vm_entry.S
+++ b/arch/hexagon/kernel/vm_entry.S
@@ -266,4 +266,8 @@
 	.globl ret_from_fork
 ret_from_fork:
 	call schedule_tail
+	P0 = cmp.eq(R24, #0);
+	if P0 jump return_from_syscall
+	R0 = R25;
+	callr R24
 	jump return_from_syscall
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 3279646..6706004 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -42,6 +42,8 @@
 	select GENERIC_TIME_VSYSCALL_OLD
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
+	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
index 944152a..e0a899a 100644
--- a/arch/ia64/include/asm/processor.h
+++ b/arch/ia64/include/asm/processor.h
@@ -340,22 +340,6 @@
  */
 #define release_thread(dead_task)
 
-/*
- * This is the mechanism for creating a new kernel thread.
- *
- * NOTE 1: Only a kernel-only process (ie the swapper or direct
- * descendants who haven't done an "execve()") should use this: it
- * will work within a system call from a "real" process, but the
- * process memory space will not be free'd until both the parent and
- * the child have exited.
- *
- * NOTE 2: This MUST NOT be an inlined function.  Otherwise, we get
- * into trouble in init/main.c when the child thread returns to
- * do_basic_setup() and the timing is such that free_initmem() has
- * been called already.
- */
-extern pid_t kernel_thread (int (*fn)(void *), void *arg, unsigned long flags);
-
 /* Get wait channel for task P.  */
 extern unsigned long get_wchan (struct task_struct *p);
 
diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h
index 8b3ff2f..1574bca 100644
--- a/arch/ia64/include/asm/unistd.h
+++ b/arch/ia64/include/asm/unistd.h
@@ -29,6 +29,7 @@
 
 #define __ARCH_WANT_SYS_RT_SIGACTION
 #define __ARCH_WANT_SYS_RT_SIGSUSPEND
+#define __ARCH_WANT_SYS_EXECVE
 
 #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER)
 
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 1ccbe12..940a672 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -61,14 +61,13 @@
 	 * Allocate 8 input registers since ptrace() may clobber them
 	 */
 	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,2,4,0
+	alloc loc1=ar.pfs,8,2,3,0
 	mov loc0=rp
 	.body
 	mov out0=in0			// filename
 	;;				// stop bit between alloc and call
 	mov out1=in1			// argv
 	mov out2=in2			// envp
-	add out3=16,sp			// regs
 	br.call.sptk.many rp=sys_execve
 .ret0:
 	cmp4.ge p6,p7=r8,r0
@@ -76,7 +75,6 @@
 	sxt4 r8=r8			// return 64-bit result
 	;;
 	stf.spill [sp]=f0
-(p6)	cmp.ne pKStk,pUStk=r0,r0	// a successful execve() lands us in user-mode...
 	mov rp=loc0
 (p6)	mov ar.pfs=r0			// clear ar.pfs on success
 (p7)	br.ret.sptk.many rp
@@ -484,19 +482,6 @@
 	br.ret.sptk.many rp
 END(prefetch_stack)
 
-GLOBAL_ENTRY(kernel_execve)
-	rum psr.ac
-	mov r15=__NR_execve			// put syscall number in place
-	break __BREAK_SYSCALL
-	br.ret.sptk.many rp
-END(kernel_execve)
-
-GLOBAL_ENTRY(clone)
-	mov r15=__NR_clone			// put syscall number in place
-	break __BREAK_SYSCALL
-	br.ret.sptk.many rp
-END(clone)
-
 	/*
 	 * Invoke a system call, but do some tracing before and after the call.
 	 * We MUST preserve the current register frame throughout this routine
@@ -600,6 +585,27 @@
 .ret4:	br.cond.sptk ia64_leave_kernel
 END(ia64_strace_leave_kernel)
 
+ENTRY(call_payload)
+	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(0)
+	/* call the kernel_thread payload; fn is in r4, arg - in r5 */
+	alloc loc1=ar.pfs,0,3,1,0
+	mov loc0=rp
+	mov loc2=gp
+	mov out0=r5		// arg
+	ld8 r14 = [r4], 8	// fn.address
+	;;
+	mov b6 = r14
+	ld8 gp = [r4]		// fn.gp
+	;;
+	br.call.sptk.many rp=b6	// fn(arg)
+.ret12:	mov gp=loc2
+	mov rp=loc0
+	mov ar.pfs=loc1
+	/* ... and if it has returned, we are going to userland */
+	cmp.ne pKStk,pUStk=r0,r0
+	br.ret.sptk.many rp
+END(call_payload)
+
 GLOBAL_ENTRY(ia64_ret_from_clone)
 	PT_REGS_UNWIND_INFO(0)
 {	/*
@@ -616,6 +622,7 @@
 	br.call.sptk.many rp=ia64_invoke_schedule_tail
 }
 .ret8:
+(pKStk)	br.call.sptk.many rp=call_payload
 	adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
 	;;
 	ld4 r2=[r2]
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index 629a250..4738ff7 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -1093,19 +1093,6 @@
 END(cycle_to_cputime)
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 
-GLOBAL_ENTRY(start_kernel_thread)
-	.prologue
-	.save rp, r0				// this is the end of the call-chain
-	.body
-	alloc r2 = ar.pfs, 0, 0, 2, 0
-	mov out0 = r9
-	mov out1 = r11;;
-	br.call.sptk.many rp = kernel_thread_helper;;
-	mov out0 = r8
-	br.call.sptk.many rp = sys_exit;;
-1:	br.sptk.few 1b				// not reached
-END(start_kernel_thread)
-
 #ifdef CONFIG_IA64_BRL_EMU
 
 /*
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 35e106f..25543a2 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -401,64 +401,15 @@
 	struct pt_regs *child_ptregs;
 	int retval = 0;
 
-#ifdef CONFIG_SMP
-	/*
-	 * For SMP idle threads, fork_by_hand() calls do_fork with
-	 * NULL regs.
-	 */
-	if (!regs)
-		return 0;
-#endif
-
-	stack = ((struct switch_stack *) regs) - 1;
-
 	child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1;
 	child_stack = (struct switch_stack *) child_ptregs - 1;
 
-	/* copy parent's switch_stack & pt_regs to child: */
-	memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack));
-
 	rbs = (unsigned long) current + IA64_RBS_OFFSET;
 	child_rbs = (unsigned long) p + IA64_RBS_OFFSET;
-	rbs_size = stack->ar_bspstore - rbs;
-
-	/* copy the parent's register backing store to the child: */
-	memcpy((void *) child_rbs, (void *) rbs, rbs_size);
-
-	if (likely(user_mode(child_ptregs))) {
-		if (clone_flags & CLONE_SETTLS)
-			child_ptregs->r13 = regs->r16;	/* see sys_clone2() in entry.S */
-		if (user_stack_base) {
-			child_ptregs->r12 = user_stack_base + user_stack_size - 16;
-			child_ptregs->ar_bspstore = user_stack_base;
-			child_ptregs->ar_rnat = 0;
-			child_ptregs->loadrs = 0;
-		}
-	} else {
-		/*
-		 * Note: we simply preserve the relative position of
-		 * the stack pointer here.  There is no need to
-		 * allocate a scratch area here, since that will have
-		 * been taken care of by the caller of sys_clone()
-		 * already.
-		 */
-		child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */
-		child_ptregs->r13 = (unsigned long) p;		/* set `current' pointer */
-	}
-	child_stack->ar_bspstore = child_rbs + rbs_size;
-	child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
 
 	/* copy parts of thread_struct: */
 	p->thread.ksp = (unsigned long) child_stack - 16;
 
-	/* stop some PSR bits from being inherited.
-	 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
-	 * therefore we must specify them explicitly here and not include them in
-	 * IA64_PSR_BITS_TO_CLEAR.
-	 */
-	child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
-				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
-
 	/*
 	 * NOTE: The calling convention considers all floating point
 	 * registers in the high partition (fph) to be scratch.  Since
@@ -480,8 +431,66 @@
 #	define THREAD_FLAGS_TO_SET	0
 	p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR)
 			   | THREAD_FLAGS_TO_SET);
+
 	ia64_drop_fpu(p);	/* don't pick up stale state from a CPU's fph */
 
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		if (unlikely(!user_stack_base)) {
+			/* fork_idle() called us */
+			return 0;
+		}
+		memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack));
+		child_stack->r4 = user_stack_base;	/* payload */
+		child_stack->r5 = user_stack_size;	/* argument */
+		/*
+		 * Preserve PSR bits, except for bits 32-34 and 37-45,
+		 * which we can't read.
+		 */
+		child_ptregs->cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
+		/* mark as valid, empty frame */
+		child_ptregs->cr_ifs = 1UL << 63;
+		child_stack->ar_fpsr = child_ptregs->ar_fpsr
+			= ia64_getreg(_IA64_REG_AR_FPSR);
+		child_stack->pr = (1 << PRED_KERNEL_STACK);
+		child_stack->ar_bspstore = child_rbs;
+		child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
+
+		/* stop some PSR bits from being inherited.
+		 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
+		 * therefore we must specify them explicitly here and not include them in
+		 * IA64_PSR_BITS_TO_CLEAR.
+		 */
+		child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
+				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
+
+		return 0;
+	}
+	stack = ((struct switch_stack *) regs) - 1;
+	/* copy parent's switch_stack & pt_regs to child: */
+	memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack));
+
+	/* copy the parent's register backing store to the child: */
+	rbs_size = stack->ar_bspstore - rbs;
+	memcpy((void *) child_rbs, (void *) rbs, rbs_size);
+	if (clone_flags & CLONE_SETTLS)
+		child_ptregs->r13 = regs->r16;	/* see sys_clone2() in entry.S */
+	if (user_stack_base) {
+		child_ptregs->r12 = user_stack_base + user_stack_size - 16;
+		child_ptregs->ar_bspstore = user_stack_base;
+		child_ptregs->ar_rnat = 0;
+		child_ptregs->loadrs = 0;
+	}
+	child_stack->ar_bspstore = child_rbs + rbs_size;
+	child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
+
+	/* stop some PSR bits from being inherited.
+	 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
+	 * therefore we must specify them explicitly here and not include them in
+	 * IA64_PSR_BITS_TO_CLEAR.
+	 */
+	child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
+				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
+
 #ifdef CONFIG_PERFMON
 	if (current->thread.pfm_context)
 		pfm_inherit(p, child_ptregs);
@@ -608,57 +617,6 @@
 	return 1;	/* f0-f31 are always valid so we always return 1 */
 }
 
-long
-sys_execve (const char __user *filename,
-	    const char __user *const __user *argv,
-	    const char __user *const __user *envp,
-	    struct pt_regs *regs)
-{
-	struct filename *fname;
-	int error;
-
-	fname = getname(filename);
-	error = PTR_ERR(fname);
-	if (IS_ERR(fname))
-		goto out;
-	error = do_execve(fname->name, argv, envp, regs);
-	putname(fname);
-out:
-	return error;
-}
-
-pid_t
-kernel_thread (int (*fn)(void *), void *arg, unsigned long flags)
-{
-	extern void start_kernel_thread (void);
-	unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
-	struct {
-		struct switch_stack sw;
-		struct pt_regs pt;
-	} regs;
-
-	memset(&regs, 0, sizeof(regs));
-	regs.pt.cr_iip = helper_fptr[0];	/* set entry point (IP) */
-	regs.pt.r1 = helper_fptr[1];		/* set GP */
-	regs.pt.r9 = (unsigned long) fn;	/* 1st argument */
-	regs.pt.r11 = (unsigned long) arg;	/* 2nd argument */
-	/* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read.  */
-	regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
-	regs.pt.cr_ifs = 1UL << 63;		/* mark as valid, empty frame */
-	regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
-	regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
-	regs.sw.pr = (1 << PRED_KERNEL_STACK);
-	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
-/* This gets called from kernel_thread() via ia64_invoke_thread_helper().  */
-int
-kernel_thread_helper (int (*fn)(void *), void *arg)
-{
-	return (*fn)(arg);
-}
-
 /*
  * Flush thread state.  This is called when a thread does an execve().
  */
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index e7c1614..953a7ba 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -16,6 +16,7 @@
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select ARCH_USES_GETTIMEOFFSET if MMU && !COLDFIRE
 	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_REL
 	select MODULES_USE_ELF_RELA
diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h
index 5fc7f7b..0aa1654 100644
--- a/arch/m68k/include/asm/unistd.h
+++ b/arch/m68k/include/asm/unistd.h
@@ -32,7 +32,6 @@
 #define __ARCH_WANT_SYS_RT_SIGACTION
 #define __ARCH_WANT_SYS_RT_SIGSUSPEND
 #define __ARCH_WANT_SYS_EXECVE
-#define __ARCH_WANT_KERNEL_EXECVE
 
 /*
  * "Conditional" syscalls
diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
index 946cb01..45b2f29 100644
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -115,16 +115,9 @@
 	| a3 contains the kernel thread payload, d7 - its argument
 	movel	%d1,%sp@-
 	jsr	schedule_tail
-	GET_CURRENT(%d0)
 	movel	%d7,(%sp)
 	jsr	%a3@
 	addql	#4,%sp
-	movel	%d0,(%sp)
-	jra	sys_exit
-
-ENTRY(ret_from_kernel_execve)
-	movel	4(%sp), %sp
-	GET_CURRENT(%d0)
 	jra	ret_from_exception
 
 #if defined(CONFIG_COLDFIRE) || !defined(CONFIG_MMU)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index dba9390..4183e62 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -40,6 +40,8 @@
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_REL
 	select MODULES_USE_ELF_RELA if 64BIT
+	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 
 menu "Machine selection"
 
diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index 5e33fab..d28c41e 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -310,8 +310,6 @@
 /* Free all resources held by a thread. */
 #define release_thread(thread) do { } while(0)
 
-extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
-
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 /*
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index 4f5da94..cec5e12 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -61,4 +61,10 @@
 		die(str, regs);
 }
 
+#define current_pt_regs()						\
+({									\
+	unsigned long sp = (unsigned long)__builtin_frame_address(0);	\
+	(struct pt_regs *)((sp | (THREAD_SIZE - 1)) + 1 - 32) - 1;	\
+})
+
 #endif /* _ASM_PTRACE_H */
diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h
index 9e47cc1..b306e20 100644
--- a/arch/mips/include/asm/unistd.h
+++ b/arch/mips/include/asm/unistd.h
@@ -20,6 +20,7 @@
 #define __ARCH_OMIT_COMPAT_SYS_GETDENTS64
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_SYS_ALARM
+#define __ARCH_WANT_SYS_EXECVE
 #define __ARCH_WANT_SYS_GETHOSTNAME
 #define __ARCH_WANT_SYS_IPC
 #define __ARCH_WANT_SYS_PAUSE
diff --git a/arch/mips/kernel/entry.S b/arch/mips/kernel/entry.S
index a6c1332..3320cb4 100644
--- a/arch/mips/kernel/entry.S
+++ b/arch/mips/kernel/entry.S
@@ -65,6 +65,12 @@
 	b	need_resched
 #endif
 
+FEXPORT(ret_from_kernel_thread)
+	jal	schedule_tail		# a0 = struct task_struct *prev
+	move	a0, s1
+	jal	s0
+	j	syscall_exit
+
 FEXPORT(ret_from_fork)
 	jal	schedule_tail		# a0 = struct task_struct *prev
 
diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c
index 3a21ace..8796dbc 100644
--- a/arch/mips/kernel/linux32.c
+++ b/arch/mips/kernel/linux32.c
@@ -3,7 +3,6 @@
  *
  * Copyright (C) 2000 Silicon Graphics, Inc.
  * Written by Ulf Carlsson (ulfc@engr.sgi.com)
- * sys32_execve from ia64/ia32 code, Feb 2000, Kanoj Sarcar (kanoj@sgi.com)
  */
 #include <linux/compiler.h>
 #include <linux/mm.h>
@@ -77,26 +76,6 @@
 	return error;
 }
 
-/*
- * sys_execve() executes a new program.
- */
-asmlinkage int sys32_execve(nabi_no_regargs struct pt_regs regs)
-{
-	int error;
-	struct filename *filename;
-
-	filename = getname(compat_ptr(regs.regs[4]));
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = compat_do_execve(filename->name, compat_ptr(regs.regs[5]),
-				 compat_ptr(regs.regs[6]), &regs);
-	putname(filename);
-
-out:
-	return error;
-}
-
 #define RLIM_INFINITY32	0x7fffffff
 #define RESOURCE32(x) ((x > RLIM_INFINITY32) ? RLIM_INFINITY32 : x)
 
diff --git a/arch/mips/kernel/mips_ksyms.c b/arch/mips/kernel/mips_ksyms.c
index 3fc1691..2d9304c 100644
--- a/arch/mips/kernel/mips_ksyms.c
+++ b/arch/mips/kernel/mips_ksyms.c
@@ -32,8 +32,6 @@
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
 
-EXPORT_SYMBOL(kernel_thread);
-
 /*
  * Functions that operate on entire pages.  Mostly used by memory management.
  */
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index e9a5fd7..d13720a 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -84,6 +84,7 @@
 }
 
 asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_kernel_thread(void);
 
 void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp)
 {
@@ -113,7 +114,7 @@
 }
 
 int copy_thread(unsigned long clone_flags, unsigned long usp,
-	unsigned long unused, struct task_struct *p, struct pt_regs *regs)
+	unsigned long arg, struct task_struct *p, struct pt_regs *regs)
 {
 	struct thread_info *ti = task_thread_info(p);
 	struct pt_regs *childregs;
@@ -136,19 +137,30 @@
 	childregs = (struct pt_regs *) childksp - 1;
 	/*  Put the stack after the struct pt_regs.  */
 	childksp = (unsigned long) childregs;
+	p->thread.cp0_status = read_c0_status() & ~(ST0_CU2|ST0_CU1);
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		unsigned long status = p->thread.cp0_status;
+		memset(childregs, 0, sizeof(struct pt_regs));
+		ti->addr_limit = KERNEL_DS;
+		p->thread.reg16 = usp; /* fn */
+		p->thread.reg17 = arg;
+		p->thread.reg29 = childksp;
+		p->thread.reg31 = (unsigned long) ret_from_kernel_thread;
+#if defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX)
+		status = (status & ~(ST0_KUP | ST0_IEP | ST0_IEC)) |
+			 ((status & (ST0_KUC | ST0_IEC)) << 2);
+#else
+		status |= ST0_EXL;
+#endif
+		childregs->cp0_status = status;
+		return 0;
+	}
 	*childregs = *regs;
 	childregs->regs[7] = 0;	/* Clear error flag */
-
 	childregs->regs[2] = 0;	/* Child gets zero as return value */
+	childregs->regs[29] = usp;
+	ti->addr_limit = USER_DS;
 
-	if (childregs->cp0_status & ST0_CU0) {
-		childregs->regs[28] = (unsigned long) ti;
-		childregs->regs[29] = childksp;
-		ti->addr_limit = KERNEL_DS;
-	} else {
-		childregs->regs[29] = usp;
-		ti->addr_limit = USER_DS;
-	}
 	p->thread.reg29 = (unsigned long) childregs;
 	p->thread.reg31 = (unsigned long) ret_from_fork;
 
@@ -156,7 +168,6 @@
 	 * New tasks lose permission to use the fpu. This accelerates context
 	 * switching for most programs since they don't use the fpu.
 	 */
-	p->thread.cp0_status = read_c0_status() & ~(ST0_CU2|ST0_CU1);
 	childregs->cp0_status &= ~(ST0_CU2|ST0_CU1);
 
 #ifdef CONFIG_MIPS_MT_SMTC
@@ -222,35 +233,6 @@
 }
 
 /*
- * Create a kernel thread
- */
-static void __noreturn kernel_thread_helper(void *arg, int (*fn)(void *))
-{
-	do_exit(fn(arg));
-}
-
-long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
-{
-	struct pt_regs regs;
-
-	memset(&regs, 0, sizeof(regs));
-
-	regs.regs[4] = (unsigned long) arg;
-	regs.regs[5] = (unsigned long) fn;
-	regs.cp0_epc = (unsigned long) kernel_thread_helper;
-	regs.cp0_status = read_c0_status();
-#if defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX)
-	regs.cp0_status = (regs.cp0_status & ~(ST0_KUP | ST0_IEP | ST0_IEC)) |
-			  ((regs.cp0_status & (ST0_KUC | ST0_IEC)) << 2);
-#else
-	regs.cp0_status |= ST0_EXL;
-#endif
-
-	/* Ok, create the new process.. */
-	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
-}
-
-/*
  *
  */
 struct mips_frame_info {
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index f6ba838..d27ca34 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -167,7 +167,7 @@
 	PTR	sys_getsockopt
 	PTR	sys_clone			/* 6055 */
 	PTR	sys_fork
-	PTR	sys32_execve
+	PTR	compat_sys_execve
 	PTR	sys_exit
 	PTR	compat_sys_wait4
 	PTR	sys_kill			/* 6060 */
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index 53c2d72..9601be6 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -203,7 +203,7 @@
 	PTR	sys_creat
 	PTR	sys_link
 	PTR	sys_unlink			/* 4010 */
-	PTR	sys32_execve
+	PTR	compat_sys_execve
 	PTR	sys_chdir
 	PTR	compat_sys_time
 	PTR	sys_mknod
diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
index 2bd561b..c611e2d 100644
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -127,28 +127,6 @@
 	               parent_tidptr, child_tidptr);
 }
 
-/*
- * sys_execve() executes a new program.
- */
-asmlinkage int sys_execve(nabi_no_regargs struct pt_regs regs)
-{
-	int error;
-	struct filename *filename;
-
-	filename = getname((const char __user *) (long)regs.regs[4]);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = do_execve(filename->name,
-			  (const char __user *const __user *) (long)regs.regs[5],
-	                  (const char __user *const __user *) (long)regs.regs[6],
-			  &regs);
-	putname(filename);
-
-out:
-	return error;
-}
-
 SYSCALL_DEFINE1(set_thread_area, unsigned long, addr)
 {
 	struct thread_info *ti = task_thread_info(current);
@@ -313,34 +291,3 @@
 {
 	do_exit(SIGSEGV);
 }
-
-/*
- * Do a system call from kernel instead of calling sys_execve so we
- * end up with proper pt_regs.
- */
-int kernel_execve(const char *filename,
-		  const char *const argv[],
-		  const char *const envp[])
-{
-	register unsigned long __a0 asm("$4") = (unsigned long) filename;
-	register unsigned long __a1 asm("$5") = (unsigned long) argv;
-	register unsigned long __a2 asm("$6") = (unsigned long) envp;
-	register unsigned long __a3 asm("$7");
-	unsigned long __v0;
-
-	__asm__ volatile ("					\n"
-	"	.set	noreorder				\n"
-	"	li	$2, %5		# __NR_execve		\n"
-	"	syscall						\n"
-	"	move	%0, $2					\n"
-	"	.set	reorder					\n"
-	: "=&r" (__v0), "=r" (__a3)
-	: "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_execve)
-	: "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24",
-	  "memory");
-
-	if (__a3 == 0)
-		return __v0;
-
-	return -__v0;
-}
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 05f2ba4..e7f1a29 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -22,6 +22,8 @@
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
 	select MODULES_USE_ELF_RELA
+	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 
 config MMU
 	def_bool y
diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h
index 437bdbb..5db7bc0 100644
--- a/arch/openrisc/include/uapi/asm/unistd.h
+++ b/arch/openrisc/include/uapi/asm/unistd.h
@@ -20,6 +20,8 @@
 
 #define sys_mmap2 sys_mmap_pgoff
 
+#define __ARCH_WANT_SYS_EXECVE
+
 #include <asm-generic/unistd.h>
 
 #define __NR_or1k_atomic __NR_arch_specific_syscall
diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S
index ddfcaa8..c60a09d 100644
--- a/arch/openrisc/kernel/entry.S
+++ b/arch/openrisc/kernel/entry.S
@@ -894,6 +894,16 @@
 	l.jal	schedule_tail
 	 l.nop
 
+	/* Check if we are a kernel thread */
+	l.sfeqi	r20,0
+	l.bf	1f
+	 l.nop
+
+	/* ...we are a kernel thread so invoke the requested callback */
+	l.jalr	r20
+	 l.or	r3,r22,r0
+
+1:
 	/* _syscall_returns expect r11 to contain return value */
 	l.lwz	r11,PT_GPR11(r1)
 
@@ -915,26 +925,6 @@
 	l.j	_syscall_return
 	 l.nop
 
-/* Since syscalls don't save call-clobbered registers, the args to
- * kernel_thread_helper will need to be passed through callee-saved
- * registers and copied to the parameter registers when the thread
- * begins running.
- *
- * See arch/openrisc/kernel/process.c:
- * The args are passed as follows:
- *   arg1 (r3) : passed in r20
- *   arg2 (r4) : passed in r22
- */
-
-ENTRY(_kernel_thread_helper)
-	l.or	r3,r20,r0
-	l.or	r4,r22,r0
-	l.movhi	r31,hi(kernel_thread_helper)
-	l.ori	r31,r31,lo(kernel_thread_helper)
-	l.jr	r31
-	 l.nop
-
-
 /* ========================================================[ switch ] === */
 
 /*
@@ -1044,8 +1034,13 @@
 	/* Unwind stack to pre-switch state */
 	l.addi  r1,r1,(INT_FRAME_SIZE)
 
-	/* Return via the link-register back to where we 'came from', where that can be
-	 * either schedule() or return_from_fork()... */
+	/* Return via the link-register back to where we 'came from', where
+	 * that may be either schedule(), ret_from_fork(), or
+	 * ret_from_kernel_thread().  If we are returning to a new thread,
+	 * we are expected to have set up the arg to schedule_tail already,
+	 * hence we do so here unconditionally:
+	 */
+	l.lwz   r3,TI_STACK(r3)		/* Load 'prev' as schedule_tail arg */
 	l.jr	r9
 	 l.nop
 
@@ -1088,10 +1083,6 @@
 	l.j	_fork_save_extra_regs_and_call
 	 l.addi	r3,r1,0
 
-ENTRY(sys_execve)
-	l.j	_sys_execve
-	 l.addi	r6,r1,0
-
 ENTRY(sys_sigaltstack)
 	l.j	_sys_sigaltstack
 	 l.addi	r5,r1,0
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index c35f3ab..e0874b8 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -109,66 +109,82 @@
  */
 extern asmlinkage void ret_from_fork(void);
 
+/*
+ * copy_thread
+ * @clone_flags: flags
+ * @usp: user stack pointer or fn for kernel thread
+ * @arg: arg to fn for kernel thread; always NULL for userspace thread
+ * @p: the newly created task
+ * @regs: CPU context to copy for userspace thread; always NULL for kthread
+ *
+ * At the top of a newly initialized kernel stack are two stacked pt_reg
+ * structures.  The first (topmost) is the userspace context of the thread.
+ * The second is the kernelspace context of the thread.
+ *
+ * A kernel thread will not be returning to userspace, so the topmost pt_regs
+ * struct can be uninitialized; it _does_ need to exist, though, because
+ * a kernel thread can become a userspace thread by doing a kernel_execve, in
+ * which case the topmost context will be initialized and used for 'returning'
+ * to userspace.
+ *
+ * The second pt_reg struct needs to be initialized to 'return' to
+ * ret_from_fork.  A kernel thread will need to set r20 to the address of
+ * a function to call into (with arg in r22); userspace threads need to set
+ * r20 to NULL in which case ret_from_fork will just continue a return to
+ * userspace.
+ *
+ * A kernel thread 'fn' may return; this is effectively what happens when
+ * kernel_execve is called.  In that case, the userspace pt_regs must have
+ * been initialized (which kernel_execve takes care of, see start_thread
+ * below); ret_from_fork will then continue its execution causing the
+ * 'kernel thread' to return to userspace as a userspace thread.
+ */
+
 int
 copy_thread(unsigned long clone_flags, unsigned long usp,
-	    unsigned long unused, struct task_struct *p, struct pt_regs *regs)
+	    unsigned long arg, struct task_struct *p, struct pt_regs *regs)
 {
-	struct pt_regs *childregs;
+	struct pt_regs *userregs;
 	struct pt_regs *kregs;
 	unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
-	struct thread_info *ti;
 	unsigned long top_of_kernel_stack;
 
 	top_of_kernel_stack = sp;
 
 	p->set_child_tid = p->clear_child_tid = NULL;
 
-	/* Copy registers */
-	/* redzone */
-	sp -= STACK_FRAME_OVERHEAD;
+	/* Locate userspace context on stack... */
+	sp -= STACK_FRAME_OVERHEAD;	/* redzone */
 	sp -= sizeof(struct pt_regs);
-	childregs = (struct pt_regs *)sp;
+	userregs = (struct pt_regs *) sp;
 
-	/* Copy parent registers */
-	*childregs = *regs;
-
-	if ((childregs->sr & SPR_SR_SM) == 1) {
-		/* for kernel thread, set `current_thread_info'
-		 * and stackptr in new task
-		 */
-		childregs->sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
-		childregs->gpr[10] = (unsigned long)task_thread_info(p);
-	} else {
-		childregs->sp = usp;
-	}
-
-	childregs->gpr[11] = 0;	/* Result from fork() */
-
-	/*
-	 * The way this works is that at some point in the future
-	 * some task will call _switch to switch to the new task.
-	 * That will pop off the stack frame created below and start
-	 * the new task running at ret_from_fork.  The new task will
-	 * do some house keeping and then return from the fork or clone
-	 * system call, using the stack frame created above.
-	 */
-	/* redzone */
-	sp -= STACK_FRAME_OVERHEAD;
+	/* ...and kernel context */
+	sp -= STACK_FRAME_OVERHEAD;	/* redzone */
 	sp -= sizeof(struct pt_regs);
 	kregs = (struct pt_regs *)sp;
 
-	ti = task_thread_info(p);
-	ti->ksp = sp;
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		memset(kregs, 0, sizeof(struct pt_regs));
+		kregs->gpr[20] = usp; /* fn, kernel thread */
+		kregs->gpr[22] = arg;
+	} else {
+		*userregs = *regs;
 
-	/* kregs->sp must store the location of the 'pre-switch' kernel stack
-	 * pointer... for a newly forked process, this is simply the top of
-	 * the kernel stack.
+		userregs->sp = usp;
+		userregs->gpr[11] = 0;	/* Result from fork() */
+
+		kregs->gpr[20] = 0;	/* Userspace thread */
+	}
+
+	/*
+	 * _switch wants the kernel stack page in pt_regs->sp so that it
+	 * can restore it to thread_info->ksp... see _switch for details.
 	 */
 	kregs->sp = top_of_kernel_stack;
-	kregs->gpr[3] = (unsigned long)current;	/* arg to schedule_tail */
-	kregs->gpr[10] = (unsigned long)task_thread_info(p);
 	kregs->gpr[9] = (unsigned long)ret_from_fork;
 
+	task_thread_info(p)->ksp = (unsigned long)kregs;
+
 	return 0;
 }
 
@@ -177,16 +193,14 @@
  */
 void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp)
 {
-	unsigned long sr = regs->sr & ~SPR_SR_SM;
+	unsigned long sr = mfspr(SPR_SR) & ~SPR_SR_SM;
 
 	set_fs(USER_DS);
-	memset(regs->gpr, 0, sizeof(regs->gpr));
+	memset(regs, 0, sizeof(struct pt_regs));
 
 	regs->pc = pc;
 	regs->sr = sr;
 	regs->sp = sp;
-
-/*	printk("start thread, ksp = %lx\n", current_thread_info()->ksp);*/
 }
 
 /* Fill in the fpu structure for a core dump.  */
@@ -237,74 +251,9 @@
 	dest[35] = 0;
 }
 
-extern void _kernel_thread_helper(void);
-
-void __noreturn kernel_thread_helper(int (*fn) (void *), void *arg)
-{
-	do_exit(fn(arg));
-}
-
-/*
- * Create a kernel thread.
- */
-int kernel_thread(int (*fn) (void *), void *arg, unsigned long flags)
-{
-	struct pt_regs regs;
-
-	memset(&regs, 0, sizeof(regs));
-
-	regs.gpr[20] = (unsigned long)fn;
-	regs.gpr[22] = (unsigned long)arg;
-	regs.sr = mfspr(SPR_SR);
-	regs.pc = (unsigned long)_kernel_thread_helper;
-
-	return do_fork(flags | CLONE_VM | CLONE_UNTRACED,
-		       0, &regs, 0, NULL, NULL);
-}
-
-/*
- * sys_execve() executes a new program.
- */
-asmlinkage long _sys_execve(const char __user *name,
-			    const char __user * const __user *argv,
-			    const char __user * const __user *envp,
-			    struct pt_regs *regs)
-{
-	int error;
-	struct filename *filename;
-
-	filename = getname(name);
-	error = PTR_ERR(filename);
-
-	if (IS_ERR(filename))
-		goto out;
-
-	error = do_execve(filename->name, argv, envp, regs);
-	putname(filename);
-
-out:
-	return error;
-}
-
 unsigned long get_wchan(struct task_struct *p)
 {
 	/* TODO */
 
 	return 0;
 }
-
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
-{
-	register long __res asm("r11") = __NR_execve;
-	register long __a asm("r3") = (long)(filename);
-	register long __b asm("r4") = (long)(argv);
-	register long __c asm("r5") = (long)(envp);
-	__asm__ volatile ("l.sys 1"
-			  : "=r" (__res), "=r"(__a), "=r"(__b), "=r"(__c)
-			  : "0"(__res), "1"(__a), "2"(__b), "3"(__c)
-			  : "r6", "r7", "r8", "r12", "r13", "r15",
-			    "r17", "r19", "r21", "r23", "r25", "r27",
-			    "r29", "r31");
-	__asm__ volatile ("l.nop");
-	return __res;
-}
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 11def45..0aec70c 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -22,6 +22,8 @@
 	select GENERIC_STRNCPY_FROM_USER
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
+	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h
index 541639c..55512e2 100644
--- a/arch/parisc/include/asm/unistd.h
+++ b/arch/parisc/include/asm/unistd.h
@@ -163,6 +163,7 @@
 #define __ARCH_WANT_SYS_RT_SIGACTION
 #define __ARCH_WANT_SYS_RT_SIGSUSPEND
 #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
+#define __ARCH_WANT_SYS_EXECVE
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index 18670a0..c9a9abd 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -708,59 +708,9 @@
 	.import		do_cpu_irq_mask,code
 
 	/*
-	 * r26 = function to be called
-	 * r25 = argument to pass in
-	 * r24 = flags for do_fork()
-	 *
-	 * Kernel threads don't ever return, so they don't need
-	 * a true register context. We just save away the arguments
-	 * for copy_thread/ret_ to properly set up the child.
-	 */
-
-#define CLONE_VM 0x100	/* Must agree with <linux/sched.h> */
-#define CLONE_UNTRACED 0x00800000
-
-	.import do_fork
-ENTRY(__kernel_thread)
-	STREG	%r2, -RP_OFFSET(%r30)
-
-	copy	%r30, %r1
-	ldo	PT_SZ_ALGN(%r30),%r30
-#ifdef CONFIG_64BIT
-	/* Yo, function pointers in wide mode are little structs... -PB */
-	ldd	24(%r26), %r2
-	STREG	%r2, PT_GR27(%r1)	/* Store childs %dp */
-	ldd	16(%r26), %r26
-
-	STREG	%r22, PT_GR22(%r1)	/* save r22 (arg5) */
-	copy	%r0, %r22		/* user_tid */
-#endif
-	STREG	%r26, PT_GR26(%r1)  /* Store function & argument for child */
-	STREG	%r25, PT_GR25(%r1)
-	ldil	L%CLONE_UNTRACED, %r26
-	ldo	CLONE_VM(%r26), %r26   /* Force CLONE_VM since only init_mm */
-	or	%r26, %r24, %r26      /* will have kernel mappings.	 */
-	ldi	1, %r25			/* stack_start, signals kernel thread */
-	stw	%r0, -52(%r30)	     	/* user_tid */
-#ifdef CONFIG_64BIT
-	ldo	-16(%r30),%r29		/* Reference param save area */
-#endif
-	BL	do_fork, %r2
-	copy	%r1, %r24		/* pt_regs */
-
-	/* Parent Returns here */
-
-	LDREG	-PT_SZ_ALGN-RP_OFFSET(%r30), %r2
-	ldo	-PT_SZ_ALGN(%r30), %r30
-	bv	%r0(%r2)
-	nop
-ENDPROC(__kernel_thread)
-
-	/*
 	 * Child Returns here
 	 *
-	 * copy_thread moved args from temp save area set up above
-	 * into task save area.
+	 * copy_thread moved args into task save area.
 	 */
 
 ENTRY(ret_from_kernel_thread)
@@ -769,51 +719,17 @@
 	BL	schedule_tail, %r2
 	nop
 
-	LDREG	TI_TASK-THREAD_SZ_ALGN(%r30), %r1
+	LDREG	TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE(%r30), %r1
 	LDREG	TASK_PT_GR25(%r1), %r26
 #ifdef CONFIG_64BIT
 	LDREG	TASK_PT_GR27(%r1), %r27
-	LDREG	TASK_PT_GR22(%r1), %r22
 #endif
 	LDREG	TASK_PT_GR26(%r1), %r1
 	ble	0(%sr7, %r1)
 	copy	%r31, %r2
-
-#ifdef CONFIG_64BIT
-	ldo	-16(%r30),%r29		/* Reference param save area */
-	loadgp				/* Thread could have been in a module */
-#endif
-#ifndef CONFIG_64BIT
-	b	sys_exit
-#else
-	load32	sys_exit, %r1
-	bv	%r0(%r1)
-#endif
-	ldi	0, %r26
-ENDPROC(ret_from_kernel_thread)
-
-	.import	sys_execve, code
-ENTRY(__execve)
-	copy	%r2, %r15
-	copy	%r30, %r16
-	ldo	PT_SZ_ALGN(%r30), %r30
-	STREG	%r26, PT_GR26(%r16)
-	STREG	%r25, PT_GR25(%r16)
-	STREG	%r24, PT_GR24(%r16)
-#ifdef CONFIG_64BIT
-	ldo	-16(%r30),%r29		/* Reference param save area */
-#endif
-	BL	sys_execve, %r2
-	copy	%r16, %r26
-
-	cmpib,=,n 0,%r28,intr_return    /* forward */
-
-	/* yes, this will trap and die. */
-	copy	%r15, %r2
-	copy	%r16, %r30
-	bv	%r0(%r2)
+	b	finish_child_return
 	nop
-ENDPROC(__execve)
+ENDPROC(ret_from_kernel_thread)
 
 
 	/*
@@ -1776,49 +1692,27 @@
 	LDREG	TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE(%r30), %r1
 	ldo	TASK_REGS(%r1),%r1
 	reg_save %r1
-	mfctl	%cr27, %r3
-	STREG	%r3, PT_CR27(%r1)
-
-	STREG	%r2,-RP_OFFSET(%r30)
-	ldo	FRAME_SIZE(%r30),%r30
-#ifdef CONFIG_64BIT
-	ldo	-16(%r30),%r29		/* Reference param save area */
-#endif
-
-	/* These are call-clobbered registers and therefore
-	   also syscall-clobbered (we hope). */
-	STREG	%r2,PT_GR19(%r1)	/* save for child */
-	STREG	%r30,PT_GR21(%r1)
+	mfctl	%cr27, %r28
+	STREG	%r28, PT_CR27(%r1)
 
 	LDREG	PT_GR30(%r1),%r25
 	copy	%r1,%r24
-	BL	sys_clone,%r2
+	b	sys_clone
 	ldi	SIGCHLD,%r26
-
-	LDREG	-RP_OFFSET-FRAME_SIZE(%r30),%r2
-wrapper_exit:
-	ldo	-FRAME_SIZE(%r30),%r30		/* get the stackframe */
-	LDREG	TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE(%r30),%r1
-	ldo	TASK_REGS(%r1),%r1	 /* get pt regs */
-
-	LDREG	PT_CR27(%r1), %r3
-	mtctl	%r3, %cr27
-	reg_restore %r1
-
-	/* strace expects syscall # to be preserved in r20 */
-	ldi	__NR_fork,%r20
-	bv %r0(%r2)
-	STREG	%r20,PT_GR20(%r1)
 ENDPROC(sys_fork_wrapper)
 
 	/* Set the return value for the child */
 ENTRY(child_return)
 	BL	schedule_tail, %r2
 	nop
+finish_child_return:
+	LDREG	TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE(%r30), %r1
+	ldo	TASK_REGS(%r1),%r1	 /* get pt regs */
 
-	LDREG	TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE-FRAME_SIZE(%r30), %r1
-	LDREG	TASK_PT_GR19(%r1),%r2
-	b	wrapper_exit
+	LDREG	PT_CR27(%r1), %r3
+	mtctl	%r3, %cr27
+	reg_restore %r1
+	b	syscall_exit
 	copy	%r0,%r28
 ENDPROC(child_return)
 
@@ -1827,23 +1721,10 @@
 	LDREG	TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE(%r30),%r1
 	ldo	TASK_REGS(%r1),%r1	/* get pt regs */
 	reg_save %r1
-	mfctl	%cr27, %r3
-	STREG	%r3, PT_CR27(%r1)
-
-	STREG	%r2,-RP_OFFSET(%r30)
-	ldo	FRAME_SIZE(%r30),%r30
-#ifdef CONFIG_64BIT
-	ldo	-16(%r30),%r29		/* Reference param save area */
-#endif
-
-	/* WARNING - Clobbers r19 and r21, userspace must save these! */
-	STREG	%r2,PT_GR19(%r1)	/* save for child */
-	STREG	%r30,PT_GR21(%r1)
-	BL	sys_clone,%r2
+	mfctl	%cr27, %r28
+	STREG	%r28, PT_CR27(%r1)
+	b	sys_clone
 	copy	%r1,%r24
-
-	b	wrapper_exit
-	LDREG	-RP_OFFSET-FRAME_SIZE(%r30),%r2
 ENDPROC(sys_clone_wrapper)
 
 
@@ -1851,72 +1732,14 @@
 	LDREG	TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE(%r30),%r1
 	ldo	TASK_REGS(%r1),%r1	/* get pt regs */
 	reg_save %r1
-	mfctl	%cr27, %r3
-	STREG	%r3, PT_CR27(%r1)
+	mfctl	%cr27, %r28
+	STREG	%r28, PT_CR27(%r1)
 
-	STREG	%r2,-RP_OFFSET(%r30)
-	ldo	FRAME_SIZE(%r30),%r30
-#ifdef CONFIG_64BIT
-	ldo	-16(%r30),%r29		/* Reference param save area */
-#endif
-
-	STREG	%r2,PT_GR19(%r1)	/* save for child */
-	STREG	%r30,PT_GR21(%r1)
-
-	BL	sys_vfork,%r2
+	b	sys_vfork
 	copy	%r1,%r26
-
-	b	wrapper_exit
-	LDREG	-RP_OFFSET-FRAME_SIZE(%r30),%r2
 ENDPROC(sys_vfork_wrapper)
 
 	
-	.macro  execve_wrapper execve
-	LDREG	TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE(%r30),%r1
-	ldo	TASK_REGS(%r1),%r1	/* get pt regs */
-
-	/*
-	 * Do we need to save/restore r3-r18 here?
-	 * I don't think so. why would new thread need old
-	 * threads registers?
-	 */
-
-	/* %arg0 - %arg3 are already saved for us. */
-
-	STREG %r2,-RP_OFFSET(%r30)
-	ldo FRAME_SIZE(%r30),%r30
-#ifdef CONFIG_64BIT
-	ldo	-16(%r30),%r29		/* Reference param save area */
-#endif
-	BL \execve,%r2
-	copy %r1,%arg0
-
-	ldo -FRAME_SIZE(%r30),%r30
-	LDREG -RP_OFFSET(%r30),%r2
-
-	/* If exec succeeded we need to load the args */
-
-	ldo -1024(%r0),%r1
-	cmpb,>>= %r28,%r1,error_\execve
-	copy %r2,%r19
-
-error_\execve:
-	bv %r0(%r19)
-	nop
-	.endm
-
-	.import sys_execve
-ENTRY(sys_execve_wrapper)
-	execve_wrapper sys_execve
-ENDPROC(sys_execve_wrapper)
-
-#ifdef CONFIG_64BIT
-	.import sys32_execve
-ENTRY(sys32_execve_wrapper)
-	execve_wrapper sys32_execve
-ENDPROC(sys32_execve_wrapper)
-#endif
-
 ENTRY(sys_rt_sigreturn_wrapper)
 	LDREG	TI_TASK-THREAD_SZ_ALGN-FRAME_SIZE(%r30),%r26
 	ldo	TASK_REGS(%r26),%r26	/* get pt regs */
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index cbc3721..38db36f 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -52,6 +52,7 @@
 
 #include <asm/io.h>
 #include <asm/asm-offsets.h>
+#include <asm/assembly.h>
 #include <asm/pdc.h>
 #include <asm/pdc_chassis.h>
 #include <asm/pgalloc.h>
@@ -165,23 +166,6 @@
 EXPORT_SYMBOL(pm_power_off);
 
 /*
- * Create a kernel thread
- */
-
-extern pid_t __kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
-{
-
-	/*
-	 * FIXME: Once we are sure we don't need any debug here,
-	 *	  kernel_thread can become a #define.
-	 */
-
-	return __kernel_thread(fn, arg, flags);
-}
-EXPORT_SYMBOL(kernel_thread);
-
-/*
  * Free current thread data structures etc..
  */
 void exit_thread(void)
@@ -256,8 +240,8 @@
 
 int
 copy_thread(unsigned long clone_flags, unsigned long usp,
-	    unsigned long unused,	/* in ia64 this is "user_stack_size" */
-	    struct task_struct * p, struct pt_regs * pregs)
+	    unsigned long arg,
+	    struct task_struct *p, struct pt_regs *pregs)
 {
 	struct pt_regs * cregs = &(p->thread.regs);
 	void *stack = task_stack_page(p);
@@ -270,48 +254,32 @@
 #ifdef CONFIG_HPUX
 	extern void * const hpux_child_return;
 #endif
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		memset(cregs, 0, sizeof(struct pt_regs));
+		if (!usp) /* idle thread */
+			return 0;
 
-	*cregs = *pregs;
-
-	/* Set the return value for the child.  Note that this is not
-           actually restored by the syscall exit path, but we put it
-           here for consistency in case of signals. */
-	cregs->gr[28] = 0; /* child */
-
-	/*
-	 * We need to differentiate between a user fork and a
-	 * kernel fork. We can't use user_mode, because the
-	 * the syscall path doesn't save iaoq. Right now
-	 * We rely on the fact that kernel_thread passes
-	 * in zero for usp.
-	 */
-	if (usp == 1) {
 		/* kernel thread */
-		cregs->ksp = (unsigned long)stack + THREAD_SZ_ALGN;
 		/* Must exit via ret_from_kernel_thread in order
 		 * to call schedule_tail()
 		 */
+		cregs->ksp = (unsigned long)stack + THREAD_SZ_ALGN + FRAME_SIZE;
 		cregs->kpc = (unsigned long) &ret_from_kernel_thread;
 		/*
 		 * Copy function and argument to be called from
 		 * ret_from_kernel_thread.
 		 */
 #ifdef CONFIG_64BIT
-		cregs->gr[27] = pregs->gr[27];
+		cregs->gr[27] = ((unsigned long *)usp)[3];
+		cregs->gr[26] = ((unsigned long *)usp)[2];
+#else
+		cregs->gr[26] = usp;
 #endif
-		cregs->gr[26] = pregs->gr[26];
-		cregs->gr[25] = pregs->gr[25];
+		cregs->gr[25] = arg;
 	} else {
 		/* user thread */
-		/*
-		 * Note that the fork wrappers are responsible
-		 * for setting gr[21].
-		 */
-
-		/* Use same stack depth as parent */
-		cregs->ksp = (unsigned long)stack
-			+ (pregs->gr[21] & (THREAD_SIZE - 1));
 		cregs->gr[30] = usp;
+		cregs->ksp = (unsigned long)stack + THREAD_SZ_ALGN + FRAME_SIZE;
 		if (personality(p->personality) == PER_HPUX) {
 #ifdef CONFIG_HPUX
 			cregs->kpc = (unsigned long) &hpux_child_return;
@@ -323,8 +291,7 @@
 		}
 		/* Setup thread TLS area from the 4th parameter in clone */
 		if (clone_flags & CLONE_SETTLS)
-		  cregs->cr27 = pregs->gr[23];
-	
+			cregs->cr27 = pregs->gr[23];
 	}
 
 	return 0;
@@ -335,39 +302,6 @@
 	return t->thread.regs.kpc;
 }
 
-/*
- * sys_execve() executes a new program.
- */
-
-asmlinkage int sys_execve(struct pt_regs *regs)
-{
-	int error;
-	struct filename *filename;
-
-	filename = getname((const char __user *) regs->gr[26]);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = do_execve(filename->name,
-			  (const char __user *const __user *) regs->gr[25],
-			  (const char __user *const __user *) regs->gr[24],
-			  regs);
-	putname(filename);
-out:
-
-	return error;
-}
-
-extern int __execve(const char *filename,
-		    const char *const argv[],
-		    const char *const envp[], struct task_struct *task);
-int kernel_execve(const char *filename,
-		  const char *const argv[],
-		  const char *const envp[])
-{
-	return __execve(filename, argv, envp, current);
-}
-
 unsigned long
 get_wchan(struct task_struct *p)
 {
diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c
index bf5b93a..9cfdaa1 100644
--- a/arch/parisc/kernel/sys_parisc32.c
+++ b/arch/parisc/kernel/sys_parisc32.c
@@ -53,28 +53,6 @@
 #define DBG(x)
 #endif
 
-/*
- * sys32_execve() executes a new program.
- */
-
-asmlinkage int sys32_execve(struct pt_regs *regs)
-{
-	int error;
-	struct filename *filename;
-
-	DBG(("sys32_execve(%p) r26 = 0x%lx\n", regs, regs->gr[26]));
-	filename = getname((const char __user *) regs->gr[26]);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = compat_do_execve(filename->name, compat_ptr(regs->gr[25]),
-				 compat_ptr(regs->gr[24]), regs);
-	putname(filename);
-out:
-
-	return error;
-}
-
 asmlinkage long sys32_unimplemented(int r26, int r25, int r24, int r23,
 	int r22, int r21, int r20)
 {
diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S
index 3735abd..cb2da96 100644
--- a/arch/parisc/kernel/syscall_table.S
+++ b/arch/parisc/kernel/syscall_table.S
@@ -66,7 +66,7 @@
 	ENTRY_SAME(creat)
 	ENTRY_SAME(link)
 	ENTRY_SAME(unlink)		/* 10 */
-	ENTRY_DIFF(execve_wrapper)
+	ENTRY_COMP(execve)
 	ENTRY_SAME(chdir)
 	/* See comments in kernel/time.c!!! Maybe we don't need this? */
 	ENTRY_COMP(time)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 5dba755..af431de 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -137,6 +137,7 @@
 	select KTIME_SCALAR if 32BIT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
 
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index bbbae41..ccbcab7 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -54,7 +54,6 @@
 #   define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
 # endif
 #define __ARCH_WANT_SYS_EXECVE
-#define __ARCH_WANT_KERNEL_EXECVE
 
 /*
  * "Conditional" syscalls
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index ef46f66..aa8f2ba 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -330,40 +330,18 @@
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	l	%r12,__LC_THREAD_INFO
 	l	%r13,__LC_SVC_NEW_PSW+4
+	l	%r1,BASED(.Lschedule_tail)
+	basr	%r14,%r1		# call schedule_tail
+	TRACE_IRQS_ON
+	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
 	tm	__PT_PSW+1(%r11),0x01	# forking a kernel thread ?
-	je	1f
-	l	%r1,BASED(.Lschedule_tail)
-	basr	%r14,%r1		# call schedule_tail
-	TRACE_IRQS_ON
-	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
-	j	sysc_tracenogo
-
-1:	# it's a kernel thread
-	st	%r15,__PT_R15(%r11)	# store stack pointer for new kthread
-	l	%r1,BASED(.Lschedule_tail)
-	basr	%r14,%r1		# call schedule_tail
-	TRACE_IRQS_ON
-	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
-	lm	%r9,%r11,__PT_R9(%r11)	# load gprs
+	jne	sysc_tracenogo
+	# it's a kernel thread
+	lm	%r9,%r10,__PT_R9(%r11)	# load gprs
 ENTRY(kernel_thread_starter)
 	la	%r2,0(%r10)
 	basr	%r14,%r9
-	la	%r2,0
-	br	%r11			# do_exit
-
-#
-# kernel_execve function needs to deal with pt_regs that is not
-# at the usual place
-#
-ENTRY(ret_from_kernel_execve)
-	ssm	__LC_PGM_NEW_PSW	# disable I/O and ext. interrupts
-	lr	%r15,%r2
-	lr	%r11,%r2
-	ahi	%r15,-STACK_FRAME_OVERHEAD
-	xc	__SF_BACKCHAIN(4,%r15),__SF_BACKCHAIN(%r15)
-	l	%r12,__LC_THREAD_INFO
-	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
-	j	sysc_return
+	j	sysc_tracenogo
 
 /*
  * Program check handler routine
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 07d8de3..499e95e 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -352,33 +352,17 @@
 ENTRY(ret_from_fork)
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	lg	%r12,__LC_THREAD_INFO
+	brasl	%r14,schedule_tail
+	TRACE_IRQS_ON
+	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
 	tm	__PT_PSW+1(%r11),0x01	# forking a kernel thread ?
-	je	1f
-	brasl	%r14,schedule_tail
-	TRACE_IRQS_ON
-	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
-	j	sysc_tracenogo
-1:	# it's a kernel thread
-	stg	%r15,__PT_R15(%r11)	# store stack pointer for new kthread
-	brasl	%r14,schedule_tail
-	TRACE_IRQS_ON
-	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
-	lmg	%r9,%r11,__PT_R9(%r11)	# load gprs
+	jne	sysc_tracenogo
+	# it's a kernel thread
+	lmg	%r9,%r10,__PT_R9(%r11)	# load gprs
 ENTRY(kernel_thread_starter)
 	la	%r2,0(%r10)
 	basr	%r14,%r9
-	la	%r2,0
-	br	%r11			# do_exit
-
-ENTRY(ret_from_kernel_execve)
-	ssm	__LC_PGM_NEW_PSW	# disable I/O and ext. interrupts
-	lgr	%r15,%r2
-	lgr	%r11,%r2
-	aghi	%r15,-STACK_FRAME_OVERHEAD
-	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
-	lg	%r12,__LC_THREAD_INFO
-	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
-	j	sysc_return
+	j	sysc_tracenogo
 
 /*
  * Program check handler routine
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index b6b442b..e52f3c2 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -40,6 +40,8 @@
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
 	select MODULES_USE_ELF_RELA
+	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h
index f74ac9e..c1e0191 100644
--- a/arch/sparc/include/asm/processor_32.h
+++ b/arch/sparc/include/asm/processor_32.h
@@ -106,7 +106,6 @@
 
 /* Free all resources held by a thread. */
 #define release_thread(tsk)		do { } while(0)
-extern pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
 
 extern unsigned long get_wchan(struct task_struct *);
 
diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h
index 4e5a483..0305d56 100644
--- a/arch/sparc/include/asm/processor_64.h
+++ b/arch/sparc/include/asm/processor_64.h
@@ -94,6 +94,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
+#include <asm/fpumacro.h>
 
 /* Return saved PC of a blocked thread. */
 struct task_struct;
@@ -143,6 +144,10 @@
 	: \
 	: "r" (regs), "r" (sp - sizeof(struct reg_window) - STACK_BIAS), \
 	  "i" ((const unsigned long)(&((struct pt_regs *)0)->u_regs[0]))); \
+	fprs_write(0);	\
+	current_thread_info()->xfsr[0] = 0;	\
+	current_thread_info()->fpsaved[0] = 0;	\
+	regs->tstate &= ~TSTATE_PEF;	\
 } while (0)
 
 #define start_thread32(regs, pc, sp) \
@@ -183,13 +188,15 @@
 	: \
 	: "r" (regs), "r" (sp - sizeof(struct reg_window32)), \
 	  "i" ((const unsigned long)(&((struct pt_regs *)0)->u_regs[0]))); \
+	fprs_write(0);	\
+	current_thread_info()->xfsr[0] = 0;	\
+	current_thread_info()->fpsaved[0] = 0;	\
+	regs->tstate &= ~TSTATE_PEF;	\
 } while (0)
 
 /* Free all resources held by a thread. */
 #define release_thread(tsk)		do { } while (0)
 
-extern pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
-
 extern unsigned long get_wchan(struct task_struct *task);
 
 #define task_pt_regs(tsk) (task_thread_info(tsk)->kregs)
diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h
index da43bdc..bdfafd7 100644
--- a/arch/sparc/include/asm/ptrace.h
+++ b/arch/sparc/include/asm/ptrace.h
@@ -32,6 +32,9 @@
 #define arch_ptrace_stop(exit_code, info) \
 	synchronize_user_stack()
 
+#define current_pt_regs() \
+	((struct pt_regs *)((unsigned long)current_thread_info() + THREAD_SIZE) - 1)
+
 struct global_reg_snapshot {
 	unsigned long		tstate;
 	unsigned long		tpc;
@@ -55,9 +58,7 @@
 
 extern union global_cpu_snapshot global_cpu_snapshot[NR_CPUS];
 
-#define force_successful_syscall_return()	    \
-do {	current_thread_info()->syscall_noerror = 1; \
-} while (0)
+#define force_successful_syscall_return() set_thread_noerror(1)
 #define user_mode(regs) (!((regs)->tstate & TSTATE_PRIV))
 #define instruction_pointer(regs) ((regs)->tpc)
 #define instruction_pointer_set(regs, val) ((regs)->tpc = (val))
@@ -100,6 +101,9 @@
 #define arch_ptrace_stop(exit_code, info) \
 	synchronize_user_stack()
 
+#define current_pt_regs() \
+	((struct pt_regs *)((unsigned long)current_thread_info() + THREAD_SIZE) - 1)
+
 #define user_mode(regs) (!((regs)->psr & PSR_PS))
 #define instruction_pointer(regs) ((regs)->pc)
 #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP])
diff --git a/arch/sparc/include/asm/switch_to_64.h b/arch/sparc/include/asm/switch_to_64.h
index 7923c4a..cad36f5 100644
--- a/arch/sparc/include/asm/switch_to_64.h
+++ b/arch/sparc/include/asm/switch_to_64.h
@@ -23,7 +23,7 @@
 	/* If you are tempted to conditionalize the following */	\
 	/* so that ASI is only written if it changes, think again. */	\
 	__asm__ __volatile__("wr %%g0, %0, %%asi"			\
-	: : "r" (__thread_flag_byte_ptr(task_thread_info(next))[TI_FLAG_BYTE_CURRENT_DS]));\
+	: : "r" (task_thread_info(next)->current_ds));\
 	trap_block[current_thread_info()->cpu].thread =			\
 		task_thread_info(next);					\
 	__asm__ __volatile__(						\
diff --git a/arch/sparc/include/asm/syscalls.h b/arch/sparc/include/asm/syscalls.h
index 45a43f6..bf8972a 100644
--- a/arch/sparc/include/asm/syscalls.h
+++ b/arch/sparc/include/asm/syscalls.h
@@ -8,6 +8,4 @@
 				     struct pt_regs *regs,
 				     unsigned long stack_size);
 
-extern asmlinkage int sparc_execve(struct pt_regs *regs);
-
 #endif /* _SPARC64_SYSCALLS_H */
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 4e227663..ea2ba22 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -14,12 +14,12 @@
 #define TI_FLAG_FAULT_CODE_SHIFT	56
 #define TI_FLAG_BYTE_WSTATE		1
 #define TI_FLAG_WSTATE_SHIFT		48
-#define TI_FLAG_BYTE_CWP		2
-#define TI_FLAG_CWP_SHIFT		40
-#define TI_FLAG_BYTE_CURRENT_DS		3
-#define TI_FLAG_CURRENT_DS_SHIFT	32
-#define TI_FLAG_BYTE_FPDEPTH		4
-#define TI_FLAG_FPDEPTH_SHIFT		24
+#define TI_FLAG_BYTE_NOERROR		2
+#define TI_FLAG_BYTE_NOERROR_SHIFT	40
+#define TI_FLAG_BYTE_FPDEPTH		3
+#define TI_FLAG_FPDEPTH_SHIFT		32
+#define TI_FLAG_BYTE_CWP		4
+#define TI_FLAG_CWP_SHIFT		24
 #define TI_FLAG_BYTE_WSAVED		5
 #define TI_FLAG_WSAVED_SHIFT		16
 
@@ -47,7 +47,7 @@
 	struct exec_domain	*exec_domain;
 	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 	__u8			new_child;
-	__u8			syscall_noerror;
+	__u8			current_ds;
 	__u16			cpu;
 
 	unsigned long		*utraps;
@@ -74,9 +74,9 @@
 #define TI_FAULT_CODE	(TI_FLAGS + TI_FLAG_BYTE_FAULT_CODE)
 #define TI_WSTATE	(TI_FLAGS + TI_FLAG_BYTE_WSTATE)
 #define TI_CWP		(TI_FLAGS + TI_FLAG_BYTE_CWP)
-#define TI_CURRENT_DS	(TI_FLAGS + TI_FLAG_BYTE_CURRENT_DS)
 #define TI_FPDEPTH	(TI_FLAGS + TI_FLAG_BYTE_FPDEPTH)
 #define TI_WSAVED	(TI_FLAGS + TI_FLAG_BYTE_WSAVED)
+#define TI_SYS_NOERROR	(TI_FLAGS + TI_FLAG_BYTE_NOERROR)
 #define TI_FPSAVED	0x00000010
 #define TI_KSP		0x00000018
 #define TI_FAULT_ADDR	0x00000020
@@ -84,7 +84,7 @@
 #define TI_EXEC_DOMAIN	0x00000030
 #define TI_PRE_COUNT	0x00000038
 #define TI_NEW_CHILD	0x0000003c
-#define TI_SYS_NOERROR	0x0000003d
+#define TI_CURRENT_DS	0x0000003d
 #define TI_CPU		0x0000003e
 #define TI_UTRAPS	0x00000040
 #define TI_REG_WINDOW	0x00000048
@@ -121,7 +121,7 @@
 #define INIT_THREAD_INFO(tsk)				\
 {							\
 	.task		=	&tsk,			\
-	.flags		= ((unsigned long)ASI_P) << TI_FLAG_CURRENT_DS_SHIFT,	\
+	.current_ds	=	ASI_P,			\
 	.exec_domain	=	&default_exec_domain,	\
 	.preempt_count	=	INIT_PREEMPT_COUNT,	\
 	.restart_block	= {				\
@@ -153,13 +153,12 @@
 #define set_thread_wstate(val)		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_WSTATE] = (val))
 #define get_thread_cwp()		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_CWP])
 #define set_thread_cwp(val)		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_CWP] = (val))
-#define get_thread_current_ds()		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_CURRENT_DS])
-#define set_thread_current_ds(val)	(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_CURRENT_DS] = (val))
+#define get_thread_noerror()		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_NOERROR])
+#define set_thread_noerror(val)		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_NOERROR] = (val))
 #define get_thread_fpdepth()		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_FPDEPTH])
 #define set_thread_fpdepth(val)		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_FPDEPTH] = (val))
 #define get_thread_wsaved()		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_WSAVED])
 #define set_thread_wsaved(val)		(__cur_thread_flag_byte_ptr[TI_FLAG_BYTE_WSAVED] = (val))
-
 #endif /* !(__ASSEMBLY__) */
 
 /*
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index 73083e1..e562d3c 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -38,14 +38,14 @@
 #define VERIFY_READ	0
 #define VERIFY_WRITE	1
 
-#define get_fs() ((mm_segment_t) { get_thread_current_ds() })
+#define get_fs() ((mm_segment_t){(current_thread_info()->current_ds)})
 #define get_ds() (KERNEL_DS)
 
 #define segment_eq(a,b)  ((a).seg == (b).seg)
 
 #define set_fs(val)								\
 do {										\
-	set_thread_current_ds((val).seg);					\
+	current_thread_info()->current_ds =(val).seg;				\
 	__asm__ __volatile__ ("wr %%g0, %0, %%asi" : : "r" ((val).seg));	\
 } while(0)
 
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h
index 0ecea6e..c3e5d8b 100644
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -46,6 +46,7 @@
 #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
 #define __ARCH_WANT_COMPAT_SYS_SENDFILE
 #endif
+#define __ARCH_WANT_SYS_EXECVE
 
 /*
  * "Conditional" syscalls
diff --git a/arch/sparc/kernel/entry.S b/arch/sparc/kernel/entry.S
index dcaa1cf..21fd1a8 100644
--- a/arch/sparc/kernel/entry.S
+++ b/arch/sparc/kernel/entry.S
@@ -806,23 +806,10 @@
 	call	c_sys_nis_syscall
 	 mov	%l5, %o7
 
-	.align	4
-	.globl	sys_execve
-sys_execve:
-	mov	%o7, %l5
-	add	%sp, STACKFRAME_SZ, %o0		! pt_regs *regs arg
-	call	sparc_execve
-	 mov	%l5, %o7
-
-	.globl	sunos_execv
 sunos_execv:
-	st	%g0, [%sp + STACKFRAME_SZ + PT_I2]
-
-	call	sparc_execve
-	 add	%sp, STACKFRAME_SZ, %o0
-
-	b	ret_sys_call
-	 ld	[%sp + STACKFRAME_SZ + PT_I0], %o0
+	.globl	sunos_execv
+	b	sys_execve
+	 clr	%i2
 
 	.align	4
 	.globl	sys_sparc_pipe
@@ -959,17 +946,9 @@
         .align  4
 linux_sparc_ni_syscall:
 	sethi   %hi(sys_ni_syscall), %l7
-	b       syscall_is_too_hard
+	b       do_syscall
 	 or     %l7, %lo(sys_ni_syscall), %l7
 
-linux_fast_syscall:
-	andn	%l7, 3, %l7
-	mov	%i0, %o0
-	mov	%i1, %o1
-	mov 	%i2, %o2
-	jmpl	%l7 + %g0, %g0
-	 mov	%i3, %o3
-
 linux_syscall_trace:
 	add	%sp, STACKFRAME_SZ, %o0
 	call	syscall_trace
@@ -991,6 +970,23 @@
 	b	ret_sys_call
 	 ld	[%sp + STACKFRAME_SZ + PT_I0], %o0
 
+	.globl	ret_from_kernel_thread
+ret_from_kernel_thread:
+	call	schedule_tail
+	 ld	[%g3 + TI_TASK], %o0
+	ld	[%sp + STACKFRAME_SZ + PT_G1], %l0
+	call	%l0
+	 ld	[%sp + STACKFRAME_SZ + PT_G2], %o0
+	rd	%psr, %l1
+	ld	[%sp + STACKFRAME_SZ + PT_PSR], %l0
+	andn	%l0, PSR_CWP, %l0
+	nop
+	and	%l1, PSR_CWP, %l1
+	or	%l0, %l1, %l0
+	st	%l0, [%sp + STACKFRAME_SZ + PT_PSR]
+	b	ret_sys_call
+	 mov	0, %o0
+
 	/* Linux native system calls enter here... */
 	.align	4
 	.globl	linux_sparc_syscall
@@ -1002,11 +998,8 @@
 	bgeu	linux_sparc_ni_syscall
 	 sll	%g1, 2, %l4
 	ld	[%l7 + %l4], %l7
-	andcc	%l7, 1, %g0
-	bne	linux_fast_syscall
-	 /* Just do first insn from SAVE_ALL in the delay slot */
 
-syscall_is_too_hard:
+do_syscall:
 	SAVE_ALL_HEAD
 	 rd	%wim, %l3
 
diff --git a/arch/sparc/kernel/etrap_64.S b/arch/sparc/kernel/etrap_64.S
index 786b185..1276ca2 100644
--- a/arch/sparc/kernel/etrap_64.S
+++ b/arch/sparc/kernel/etrap_64.S
@@ -92,8 +92,10 @@
 		rdpr	%wstate, %g2
 		wrpr	%g0, 0, %canrestore
 		sll	%g2, 3, %g2
+
+		/* Set TI_SYS_FPDEPTH to 1 and clear TI_SYS_NOERROR.  */
 		mov	1, %l5
-		stb	%l5, [%l6 + TI_FPDEPTH]
+		sth	%l5, [%l6 + TI_SYS_NOERROR]
 
 		wrpr	%g3, 0, %otherwin
 		wrpr	%g2, 0, %wstate
@@ -152,7 +154,9 @@
 		add	%l6, TI_FPSAVED + 1, %l4
 		srl	%l5, 1, %l3
 		add	%l5, 2, %l5
-		stb	%l5, [%l6 + TI_FPDEPTH]
+
+		/* Set TI_SYS_FPDEPTH to %l5 and clear TI_SYS_NOERROR.  */
+		sth	%l5, [%l6 + TI_SYS_NOERROR]
 		ba,pt	%xcc, 2b
 		 stb	%g0, [%l4 + %l3]
 		nop
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 487bffb..bf4c6ad 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -316,9 +316,10 @@
  * XXX See comment above sys_vfork in sparc64. todo.
  */
 extern void ret_from_fork(void);
+extern void ret_from_kernel_thread(void);
 
 int copy_thread(unsigned long clone_flags, unsigned long sp,
-		unsigned long unused,
+		unsigned long arg,
 		struct task_struct *p, struct pt_regs *regs)
 {
 	struct thread_info *ti = task_thread_info(p);
@@ -336,16 +337,13 @@
 	}
 
 	/*
-	 *  p->thread_info         new_stack   childregs
-	 *  !                      !           !             {if(PSR_PS) }
-	 *  V                      V (stk.fr.) V  (pt_regs)  { (stk.fr.) }
-	 *  +----- - - - - - ------+===========+============={+==========}+
+	 *  p->thread_info         new_stack   childregs stack bottom
+	 *  !                      !           !             !
+	 *  V                      V (stk.fr.) V  (pt_regs)  V
+	 *  +----- - - - - - ------+===========+=============+
 	 */
 	new_stack = task_stack_page(p) + THREAD_SIZE;
-	if (regs->psr & PSR_PS)
-		new_stack -= STACKFRAME_SZ;
 	new_stack -= STACKFRAME_SZ + TRACEREG_SZ;
-	memcpy(new_stack, (char *)regs - STACKFRAME_SZ, STACKFRAME_SZ + TRACEREG_SZ);
 	childregs = (struct pt_regs *) (new_stack + STACKFRAME_SZ);
 
 	/*
@@ -356,55 +354,58 @@
 	 * Thus, kpsr|=PSR_PIL.
 	 */
 	ti->ksp = (unsigned long) new_stack;
+	p->thread.kregs = childregs;
+
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		extern int nwindows;
+		unsigned long psr;
+		memset(new_stack, 0, STACKFRAME_SZ + TRACEREG_SZ);
+		p->thread.flags |= SPARC_FLAG_KTHREAD;
+		p->thread.current_ds = KERNEL_DS;
+		ti->kpc = (((unsigned long) ret_from_kernel_thread) - 0x8);
+		childregs->u_regs[UREG_G1] = sp; /* function */
+		childregs->u_regs[UREG_G2] = arg;
+		psr = childregs->psr = get_psr();
+		ti->kpsr = psr | PSR_PIL;
+		ti->kwim = 1 << (((psr & PSR_CWP) + 1) % nwindows);
+		return 0;
+	}
+	memcpy(new_stack, (char *)regs - STACKFRAME_SZ, STACKFRAME_SZ + TRACEREG_SZ);
+	childregs->u_regs[UREG_FP] = sp;
+	p->thread.flags &= ~SPARC_FLAG_KTHREAD;
+	p->thread.current_ds = USER_DS;
 	ti->kpc = (((unsigned long) ret_from_fork) - 0x8);
 	ti->kpsr = current->thread.fork_kpsr | PSR_PIL;
 	ti->kwim = current->thread.fork_kwim;
 
-	if(regs->psr & PSR_PS) {
-		extern struct pt_regs fake_swapper_regs;
+	if (sp != regs->u_regs[UREG_FP]) {
+		struct sparc_stackf __user *childstack;
+		struct sparc_stackf __user *parentstack;
 
-		p->thread.kregs = &fake_swapper_regs;
-		new_stack += STACKFRAME_SZ + TRACEREG_SZ;
-		childregs->u_regs[UREG_FP] = (unsigned long) new_stack;
-		p->thread.flags |= SPARC_FLAG_KTHREAD;
-		p->thread.current_ds = KERNEL_DS;
-		memcpy(new_stack, (void *)regs->u_regs[UREG_FP], STACKFRAME_SZ);
-		childregs->u_regs[UREG_G6] = (unsigned long) ti;
-	} else {
-		p->thread.kregs = childregs;
-		childregs->u_regs[UREG_FP] = sp;
-		p->thread.flags &= ~SPARC_FLAG_KTHREAD;
-		p->thread.current_ds = USER_DS;
-
-		if (sp != regs->u_regs[UREG_FP]) {
-			struct sparc_stackf __user *childstack;
-			struct sparc_stackf __user *parentstack;
-
-			/*
-			 * This is a clone() call with supplied user stack.
-			 * Set some valid stack frames to give to the child.
-			 */
-			childstack = (struct sparc_stackf __user *)
-				(sp & ~0xfUL);
-			parentstack = (struct sparc_stackf __user *)
-				regs->u_regs[UREG_FP];
+		/*
+		 * This is a clone() call with supplied user stack.
+		 * Set some valid stack frames to give to the child.
+		 */
+		childstack = (struct sparc_stackf __user *)
+			(sp & ~0xfUL);
+		parentstack = (struct sparc_stackf __user *)
+			regs->u_regs[UREG_FP];
 
 #if 0
-			printk("clone: parent stack:\n");
-			show_stackframe(parentstack);
+		printk("clone: parent stack:\n");
+		show_stackframe(parentstack);
 #endif
 
-			childstack = clone_stackframe(childstack, parentstack);
-			if (!childstack)
-				return -EFAULT;
+		childstack = clone_stackframe(childstack, parentstack);
+		if (!childstack)
+			return -EFAULT;
 
 #if 0
-			printk("clone: child stack:\n");
-			show_stackframe(childstack);
+		printk("clone: child stack:\n");
+		show_stackframe(childstack);
 #endif
 
-			childregs->u_regs[UREG_FP] = (unsigned long)childstack;
-		}
+		childregs->u_regs[UREG_FP] = (unsigned long)childstack;
 	}
 
 #ifdef CONFIG_SMP
@@ -475,69 +476,6 @@
 	return 1;
 }
 
-/*
- * sparc_execve() executes a new program after the asm stub has set
- * things up for us.  This should basically do what I want it to.
- */
-asmlinkage int sparc_execve(struct pt_regs *regs)
-{
-	int error, base = 0;
-	struct filename *filename;
-
-	/* Check for indirect call. */
-	if(regs->u_regs[UREG_G1] == 0)
-		base = 1;
-
-	filename = getname((char __user *)regs->u_regs[base + UREG_I0]);
-	error = PTR_ERR(filename);
-	if(IS_ERR(filename))
-		goto out;
-	error = do_execve(filename->name,
-			  (const char __user *const  __user *)
-			  regs->u_regs[base + UREG_I1],
-			  (const char __user *const  __user *)
-			  regs->u_regs[base + UREG_I2],
-			  regs);
-	putname(filename);
-out:
-	return error;
-}
-
-/*
- * This is the mechanism for creating a new kernel thread.
- *
- * NOTE! Only a kernel-only process(ie the swapper or direct descendants
- * who haven't done an "execve()") should use this: it will work within
- * a system call from a "real" process, but the process memory space will
- * not be freed until both the parent and the child have exited.
- */
-pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
-{
-	long retval;
-
-	__asm__ __volatile__("mov %4, %%g2\n\t"    /* Set aside fn ptr... */
-			     "mov %5, %%g3\n\t"    /* and arg. */
-			     "mov %1, %%g1\n\t"
-			     "mov %2, %%o0\n\t"    /* Clone flags. */
-			     "mov 0, %%o1\n\t"     /* usp arg == 0 */
-			     "t 0x10\n\t"          /* Linux/Sparc clone(). */
-			     "cmp %%o1, 0\n\t"
-			     "be 1f\n\t"           /* The parent, just return. */
-			     " nop\n\t"            /* Delay slot. */
-			     "jmpl %%g2, %%o7\n\t" /* Call the function. */
-			     " mov %%g3, %%o0\n\t" /* Get back the arg in delay. */
-			     "mov %3, %%g1\n\t"
-			     "t 0x10\n\t"          /* Linux/Sparc exit(). */
-			     /* Notreached by child. */
-			     "1: mov %%o0, %0\n\t" :
-			     "=r" (retval) :
-			     "i" (__NR_clone), "r" (flags | CLONE_VM | CLONE_UNTRACED),
-			     "i" (__NR_exit),  "r" (fn), "r" (arg) :
-			     "g1", "g2", "g3", "o0", "o1", "memory", "cc");
-	return retval;
-}
-EXPORT_SYMBOL(kernel_thread);
-
 unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc, fp, bias = 0;
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index d778248..db5b46a 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -618,65 +618,56 @@
  * Child  -->  %o0 == parents pid, %o1 == 1
  */
 int copy_thread(unsigned long clone_flags, unsigned long sp,
-		unsigned long unused,
+		unsigned long arg,
 		struct task_struct *p, struct pt_regs *regs)
 {
 	struct thread_info *t = task_thread_info(p);
 	struct sparc_stackf *parent_sf;
 	unsigned long child_stack_sz;
 	char *child_trap_frame;
-	int kernel_thread;
-
-	kernel_thread = (regs->tstate & TSTATE_PRIV) ? 1 : 0;
-	parent_sf = ((struct sparc_stackf *) regs) - 1;
 
 	/* Calculate offset to stack_frame & pt_regs */
-	child_stack_sz = ((STACKFRAME_SZ + TRACEREG_SZ) +
-			  (kernel_thread ? STACKFRAME_SZ : 0));
+	child_stack_sz = (STACKFRAME_SZ + TRACEREG_SZ);
 	child_trap_frame = (task_stack_page(p) +
 			    (THREAD_SIZE - child_stack_sz));
-	memcpy(child_trap_frame, parent_sf, child_stack_sz);
 
-	t->flags = (t->flags & ~((0xffUL << TI_FLAG_CWP_SHIFT) |
-				 (0xffUL << TI_FLAG_CURRENT_DS_SHIFT))) |
-		(((regs->tstate + 1) & TSTATE_CWP) << TI_FLAG_CWP_SHIFT);
 	t->new_child = 1;
 	t->ksp = ((unsigned long) child_trap_frame) - STACK_BIAS;
 	t->kregs = (struct pt_regs *) (child_trap_frame +
 				       sizeof(struct sparc_stackf));
 	t->fpsaved[0] = 0;
 
-	if (kernel_thread) {
-		struct sparc_stackf *child_sf = (struct sparc_stackf *)
-			(child_trap_frame + (STACKFRAME_SZ + TRACEREG_SZ));
-
-		/* Zero terminate the stack backtrace.  */
-		child_sf->fp = NULL;
-		t->kregs->u_regs[UREG_FP] =
-		  ((unsigned long) child_sf) - STACK_BIAS;
-
-		t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT);
-		t->kregs->u_regs[UREG_G6] = (unsigned long) t;
-		t->kregs->u_regs[UREG_G4] = (unsigned long) t->task;
-	} else {
-		if (t->flags & _TIF_32BIT) {
-			sp &= 0x00000000ffffffffUL;
-			regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;
-		}
-		t->kregs->u_regs[UREG_FP] = sp;
-		t->flags |= ((long)ASI_AIUS << TI_FLAG_CURRENT_DS_SHIFT);
-		if (sp != regs->u_regs[UREG_FP]) {
-			unsigned long csp;
-
-			csp = clone_stackframe(sp, regs->u_regs[UREG_FP]);
-			if (!csp)
-				return -EFAULT;
-			t->kregs->u_regs[UREG_FP] = csp;
-		}
-		if (t->utraps)
-			t->utraps[0]++;
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		memset(child_trap_frame, 0, child_stack_sz);
+		__thread_flag_byte_ptr(t)[TI_FLAG_BYTE_CWP] = 
+			(current_pt_regs()->tstate + 1) & TSTATE_CWP;
+		t->current_ds = ASI_P;
+		t->kregs->u_regs[UREG_G1] = sp; /* function */
+		t->kregs->u_regs[UREG_G2] = arg;
+		return 0;
 	}
 
+	parent_sf = ((struct sparc_stackf *) regs) - 1;
+	memcpy(child_trap_frame, parent_sf, child_stack_sz);
+	if (t->flags & _TIF_32BIT) {
+		sp &= 0x00000000ffffffffUL;
+		regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;
+	}
+	t->kregs->u_regs[UREG_FP] = sp;
+	__thread_flag_byte_ptr(t)[TI_FLAG_BYTE_CWP] = 
+		(regs->tstate + 1) & TSTATE_CWP;
+	t->current_ds = ASI_AIUS;
+	if (sp != regs->u_regs[UREG_FP]) {
+		unsigned long csp;
+
+		csp = clone_stackframe(sp, regs->u_regs[UREG_FP]);
+		if (!csp)
+			return -EFAULT;
+		t->kregs->u_regs[UREG_FP] = csp;
+	}
+	if (t->utraps)
+		t->utraps[0]++;
+
 	/* Set the return value for the child. */
 	t->kregs->u_regs[UREG_I0] = current->pid;
 	t->kregs->u_regs[UREG_I1] = 1;
@@ -690,45 +681,6 @@
 	return 0;
 }
 
-/*
- * This is the mechanism for creating a new kernel thread.
- *
- * NOTE! Only a kernel-only process(ie the swapper or direct descendants
- * who haven't done an "execve()") should use this: it will work within
- * a system call from a "real" process, but the process memory space will
- * not be freed until both the parent and the child have exited.
- */
-pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
-{
-	long retval;
-
-	/* If the parent runs before fn(arg) is called by the child,
-	 * the input registers of this function can be clobbered.
-	 * So we stash 'fn' and 'arg' into global registers which
-	 * will not be modified by the parent.
-	 */
-	__asm__ __volatile__("mov %4, %%g2\n\t"	   /* Save FN into global */
-			     "mov %5, %%g3\n\t"	   /* Save ARG into global */
-			     "mov %1, %%g1\n\t"	   /* Clone syscall nr. */
-			     "mov %2, %%o0\n\t"	   /* Clone flags. */
-			     "mov 0, %%o1\n\t"	   /* usp arg == 0 */
-			     "t 0x6d\n\t"	   /* Linux/Sparc clone(). */
-			     "brz,a,pn %%o1, 1f\n\t" /* Parent, just return. */
-			     " mov %%o0, %0\n\t"
-			     "jmpl %%g2, %%o7\n\t"   /* Call the function. */
-			     " mov %%g3, %%o0\n\t"   /* Set arg in delay. */
-			     "mov %3, %%g1\n\t"
-			     "t 0x6d\n\t"	   /* Linux/Sparc exit(). */
-			     /* Notreached by child. */
-			     "1:" :
-			     "=r" (retval) :
-			     "i" (__NR_clone), "r" (flags | CLONE_VM | CLONE_UNTRACED),
-			     "i" (__NR_exit),  "r" (fn), "r" (arg) :
-			     "g1", "g2", "g3", "o0", "o1", "memory", "cc");
-	return retval;
-}
-EXPORT_SYMBOL(kernel_thread);
-
 typedef struct {
 	union {
 		unsigned int	pr_regs[32];
@@ -795,41 +747,6 @@
 }
 EXPORT_SYMBOL(dump_fpu);
 
-/*
- * sparc_execve() executes a new program after the asm stub has set
- * things up for us.  This should basically do what I want it to.
- */
-asmlinkage int sparc_execve(struct pt_regs *regs)
-{
-	int error, base = 0;
-	struct filename *filename;
-
-	/* User register window flush is done by entry.S */
-
-	/* Check for indirect call. */
-	if (regs->u_regs[UREG_G1] == 0)
-		base = 1;
-
-	filename = getname((char __user *)regs->u_regs[base + UREG_I0]);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = do_execve(filename->name,
-			  (const char __user *const __user *)
-			  regs->u_regs[base + UREG_I1],
-			  (const char __user *const __user *)
-			  regs->u_regs[base + UREG_I2], regs);
-	putname(filename);
-	if (!error) {
-		fprs_write(0);
-		current_thread_info()->xfsr[0] = 0;
-		current_thread_info()->fpsaved[0] = 0;
-		regs->tstate &= ~TSTATE_PEF;
-	}
-out:
-	return error;
-}
-
 unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc, fp, bias = 0;
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index c323981..03c7e92 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -396,42 +396,6 @@
         return ret;
 }
 
-/*
- * sparc32_execve() executes a new program after the asm stub has set
- * things up for us.  This should basically do what I want it to.
- */
-asmlinkage long sparc32_execve(struct pt_regs *regs)
-{
-	int error, base = 0;
-	struct filename *filename;
-
-	/* User register window flush is done by entry.S */
-
-	/* Check for indirect call. */
-	if ((u32)regs->u_regs[UREG_G1] == 0)
-		base = 1;
-
-	filename = getname(compat_ptr(regs->u_regs[base + UREG_I0]));
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-
-	error = compat_do_execve(filename->name,
-				 compat_ptr(regs->u_regs[base + UREG_I1]),
-				 compat_ptr(regs->u_regs[base + UREG_I2]), regs);
-
-	putname(filename);
-
-	if (!error) {
-		fprs_write(0);
-		current_thread_info()->xfsr[0] = 0;
-		current_thread_info()->fpsaved[0] = 0;
-		regs->tstate &= ~TSTATE_PEF;
-	}
-out:
-	return error;
-}
-
 #ifdef CONFIG_MODULES
 
 asmlinkage long sys32_init_module(void __user *umod, u32 len,
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index 0c9b31b..a8e6eb0 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -258,27 +258,3 @@
 	up_read(&uts_sem);
 	return err;
 }
-
-/*
- * Do a system call from kernel instead of calling sys_execve so we
- * end up with proper pt_regs.
- */
-int kernel_execve(const char *filename,
-		  const char *const argv[],
-		  const char *const envp[])
-{
-	long __res;
-	register long __g1 __asm__ ("g1") = __NR_execve;
-	register long __o0 __asm__ ("o0") = (long)(filename);
-	register long __o1 __asm__ ("o1") = (long)(argv);
-	register long __o2 __asm__ ("o2") = (long)(envp);
-	asm volatile ("t 0x10\n\t"
-		      "bcc 1f\n\t"
-		      "mov %%o0, %0\n\t"
-		      "sub %%g0, %%o0, %0\n\t"
-		      "1:\n\t"
-		      : "=r" (__res), "=&r" (__o0)
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__g1)
-		      : "cc");
-	return __res;
-}
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 11c6c96..adfe60e 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -729,25 +729,3 @@
 
 	return ret;
 }
-
-/*
- * Do a system call from kernel instead of calling sys_execve so we
- * end up with proper pt_regs.
- */
-int kernel_execve(const char *filename,
-		  const char *const argv[],
-		  const char *const envp[])
-{
-	long __res;
-	register long __g1 __asm__ ("g1") = __NR_execve;
-	register long __o0 __asm__ ("o0") = (long)(filename);
-	register long __o1 __asm__ ("o1") = (long)(argv);
-	register long __o2 __asm__ ("o2") = (long)(envp);
-	asm volatile ("t 0x6d\n\t"
-		      "sub %%g0, %%o0, %0\n\t"
-		      "movcc %%xcc, %%o0, %0\n\t"
-		      : "=r" (__res), "=&r" (__o0)
-		      : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__g1)
-		      : "cc");
-	return __res;
-}
diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S
index 7f5f65d..2ef41e6 100644
--- a/arch/sparc/kernel/syscalls.S
+++ b/arch/sparc/kernel/syscalls.S
@@ -1,23 +1,19 @@
 	/* SunOS's execv() call only specifies the argv argument, the
 	 * environment settings are the same as the calling processes.
 	 */
-sys_execve:
-	sethi	%hi(sparc_execve), %g1
-	ba,pt	%xcc, execve_merge
-	 or	%g1, %lo(sparc_execve), %g1
+sys64_execve:
+	set	sys_execve, %g1
+	jmpl	%g1, %g0
+	 flushw
 
 #ifdef CONFIG_COMPAT
 sunos_execv:
-	stx	%g0, [%sp + PTREGS_OFF + PT_V9_I2]
+	mov	%g0, %o2
 sys32_execve:
-	sethi	%hi(sparc32_execve), %g1
-	or	%g1, %lo(sparc32_execve), %g1
-#endif
-
-execve_merge:
-	flushw
+	set	compat_sys_execve, %g1
 	jmpl	%g1, %g0
-	 add	%sp, PTREGS_OFF, %o0
+	 flushw
+#endif
 
 	.align	32
 sys_sparc_pipe:
@@ -112,11 +108,16 @@
 ret_from_syscall:
 	/* Clear current_thread_info()->new_child. */
 	stb	%g0, [%g6 + TI_NEW_CHILD]
-	ldx	[%g6 + TI_FLAGS], %l0
 	call	schedule_tail
 	 mov	%g7, %o0
+	ldx	[%sp + PTREGS_OFF + PT_V9_I0], %o0
+	brnz,pt	%o0, ret_sys_call
+	 ldx	[%g6 + TI_FLAGS], %l0
+	ldx	[%sp + PTREGS_OFF + PT_V9_G1], %l1
+	call	%l1
+	 ldx	[%sp + PTREGS_OFF + PT_V9_G2], %o0
 	ba,pt	%xcc, ret_sys_call
-	 ldx	[%sp + PTREGS_OFF + PT_V9_I0], %o0
+	 mov	0, %o0
 
 	.globl	sparc_exit
 	.type	sparc_exit,#function
@@ -222,7 +223,6 @@
 	ldx	[%sp + PTREGS_OFF + PT_V9_TNPC], %l1 ! pc = npc
 
 2:
-	stb	%g0, [%g6 + TI_SYS_NOERROR]
 	/* System call success, clear Carry condition code. */
 	andn	%g3, %g2, %g3
 3:
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 3a58e0d..acad343 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -106,7 +106,7 @@
 /*40*/	.word sys_newlstat, sys_dup, sys_sparc_pipe, sys_times, sys_nis_syscall
 	.word sys_umount, sys_setgid, sys_getgid, sys_signal, sys_geteuid
 /*50*/	.word sys_getegid, sys_acct, sys_memory_ordering, sys_nis_syscall, sys_ioctl
-	.word sys_reboot, sys_nis_syscall, sys_symlink, sys_readlink, sys_execve
+	.word sys_reboot, sys_nis_syscall, sys_symlink, sys_readlink, sys64_execve
 /*60*/	.word sys_umask, sys_chroot, sys_newfstat, sys_fstat64, sys_getpagesize
 	.word sys_msync, sys_vfork, sys_pread64, sys_pwrite64, sys_nis_syscall
 /*70*/	.word sys_nis_syscall, sys_mmap, sys_nis_syscall, sys_64_munmap, sys_mprotect
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index b66a779..e7ecf15 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2688,8 +2688,8 @@
 		     TI_PRE_COUNT != offsetof(struct thread_info,
 					      preempt_count) ||
 		     TI_NEW_CHILD != offsetof(struct thread_info, new_child) ||
-		     TI_SYS_NOERROR != offsetof(struct thread_info,
-						syscall_noerror) ||
+		     TI_CURRENT_DS != offsetof(struct thread_info,
+						current_ds) ||
 		     TI_RESTART_BLOCK != offsetof(struct thread_info,
 						  restart_block) ||
 		     TI_KUNA_REGS != offsetof(struct thread_info,
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 9e28a11..85be1ca 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -624,7 +624,7 @@
 void prom_world(int enter)
 {
 	if (!enter)
-		set_fs((mm_segment_t) { get_thread_current_ds() });
+		set_fs(get_fs());
 
 	__asm__ __volatile__("flushw");
 }
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 875d008..ea7f61e 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -21,6 +21,8 @@
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_CLOCKEVENTS
 	select MODULES_USE_ELF_RELA
+	select GENERIC_KERNEL_THREAD
+	select GENERIC_KERNEL_EXECVE
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h
index 3063e6f..ca61fb4 100644
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -275,18 +275,14 @@
 struct compat_sigaction;
 struct compat_siginfo;
 struct compat_sigaltstack;
-long compat_sys_execve(const char __user *path,
-		       compat_uptr_t __user *argv,
-		       compat_uptr_t __user *envp, struct pt_regs *);
 long compat_sys_rt_sigaction(int sig, struct compat_sigaction __user *act,
 			     struct compat_sigaction __user *oact,
 			     size_t sigsetsize);
 long compat_sys_rt_sigqueueinfo(int pid, int sig,
 				struct compat_siginfo __user *uinfo);
-long compat_sys_rt_sigreturn(struct pt_regs *);
+long compat_sys_rt_sigreturn(void);
 long compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			    struct compat_sigaltstack __user *uoss_ptr,
-			    struct pt_regs *);
+			    struct compat_sigaltstack __user *uoss_ptr);
 long compat_sys_truncate64(char __user *filename, u32 dummy, u32 low, u32 high);
 long compat_sys_ftruncate64(unsigned int fd, u32 dummy, u32 low, u32 high);
 long compat_sys_pread64(unsigned int fd, char __user *ubuf, size_t count,
@@ -303,12 +299,7 @@
 long compat_sys_sched_rr_get_interval(compat_pid_t pid,
 				      struct compat_timespec __user *interval);
 
-/* These are the intvec_64.S trampolines. */
-long _compat_sys_execve(const char __user *path,
-			const compat_uptr_t __user *argv,
-			const compat_uptr_t __user *envp);
-long _compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			    struct compat_sigaltstack __user *uoss_ptr);
+/* Assembly trampoline to avoid clobbering r0. */
 long _compat_sys_rt_sigreturn(void);
 
 #endif /* _ASM_TILE_COMPAT_H */
diff --git a/arch/tile/include/asm/elf.h b/arch/tile/include/asm/elf.h
index f8ccf08..b73e103 100644
--- a/arch/tile/include/asm/elf.h
+++ b/arch/tile/include/asm/elf.h
@@ -148,6 +148,7 @@
 #define compat_start_thread(regs, ip, usp) do { \
 		regs->pc = ptr_to_compat_reg((void *)(ip)); \
 		regs->sp = ptr_to_compat_reg((void *)(usp)); \
+		single_step_execve();	\
 	} while (0)
 
 /*
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 8c4dd9f..879073e 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -211,6 +211,7 @@
 {
 	regs->pc = pc;
 	regs->sp = usp;
+	single_step_execve();
 }
 
 /* Free all resources held by a thread. */
@@ -239,6 +240,9 @@
 #define KSTK_TOP(task)	(task_ksp0(task) - STACK_TOP_DELTA)
 #define task_pt_regs(task) \
   ((struct pt_regs *)(task_ksp0(task) - KSTK_PTREGS_GAP) - 1)
+#define current_pt_regs()                                   \
+  ((struct pt_regs *)((stack_pointer | (THREAD_SIZE - 1)) - \
+                      (KSTK_PTREGS_GAP - 1)) - 1)
 #define task_sp(task)	(task_pt_regs(task)->sp)
 #define task_pc(task)	(task_pt_regs(task)->pc)
 /* Aliases for pc and sp (used in fs/proc/array.c) */
diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h
index 1d48c5f..b8f888c 100644
--- a/arch/tile/include/asm/switch_to.h
+++ b/arch/tile/include/asm/switch_to.h
@@ -68,7 +68,10 @@
 /* Support function for forking a new task. */
 void ret_from_fork(void);
 
-/* Called from ret_from_fork() when a new process starts up. */
+/* Support function for forking a new kernel thread. */
+void ret_from_kernel_thread(void *fn, void *arg);
+
+/* Called from ret_from_xxx() when a new process starts up. */
 struct task_struct *sim_notify_fork(struct task_struct *prev);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/tile/include/asm/syscalls.h b/arch/tile/include/asm/syscalls.h
index 06f0464..369696d 100644
--- a/arch/tile/include/asm/syscalls.h
+++ b/arch/tile/include/asm/syscalls.h
@@ -51,8 +51,7 @@
 
 #ifndef __tilegx__
 /* mm/fault.c */
-long sys_cmpxchg_badaddr(unsigned long address, struct pt_regs *);
-long _sys_cmpxchg_badaddr(unsigned long address);
+long sys_cmpxchg_badaddr(unsigned long address);
 #endif
 
 #ifdef CONFIG_COMPAT
@@ -63,14 +62,23 @@
 long sys_ftruncate64(unsigned int fd, loff_t length);
 #endif
 
+/* Provide versions of standard syscalls that use current_pt_regs(). */
+long sys_clone(unsigned long clone_flags, unsigned long newsp,
+		void __user *parent_tid, void __user *child_tid);
+long sys_execve(const char __user *filename,
+		 const char __user *const __user *argv,
+		 const char __user *const __user *envp);
+long sys_rt_sigreturn(void);
+long sys_sigaltstack(const stack_t __user *, stack_t __user *);
+#define sys_clone sys_clone
+#define sys_execve sys_execve
+#define sys_rt_sigreturn sys_rt_sigreturn
+#define sys_sigaltstack sys_sigaltstack
+
 /* These are the intvec*.S trampolines. */
-long _sys_sigaltstack(const stack_t __user *, stack_t __user *);
 long _sys_rt_sigreturn(void);
 long _sys_clone(unsigned long clone_flags, unsigned long newsp,
 		void __user *parent_tid, void __user *child_tid);
-long _sys_execve(const char __user *filename,
-		 const char __user *const __user *argv,
-		 const char __user *const __user *envp);
 
 #include <asm-generic/syscalls.h>
 
diff --git a/arch/tile/include/asm/unistd.h b/arch/tile/include/asm/unistd.h
index 6e032a0..dab827d 100644
--- a/arch/tile/include/asm/unistd.h
+++ b/arch/tile/include/asm/unistd.h
@@ -16,4 +16,5 @@
 #define __ARCH_WANT_SYS_LLSEEK
 #endif
 #define __ARCH_WANT_SYS_NEWFSTATAT
+#define __ARCH_WANT_SYS_EXECVE
 #include <uapi/asm/unistd.h>
diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c
index d67459b..a2e8055 100644
--- a/arch/tile/kernel/compat.c
+++ b/arch/tile/kernel/compat.c
@@ -102,10 +102,9 @@
 #define compat_sys_fadvise64_64 sys32_fadvise64_64
 #define compat_sys_readahead sys32_readahead
 
-/* Call the trampolines to manage pt_regs where necessary. */
-#define compat_sys_execve _compat_sys_execve
-#define compat_sys_sigaltstack _compat_sys_sigaltstack
+/* Call the assembly trampolines where necessary. */
 #define compat_sys_rt_sigreturn _compat_sys_rt_sigreturn
+#undef sys_clone
 #define sys_clone _sys_clone
 
 /*
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index 08b4fe1..210a9bb 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -197,8 +197,7 @@
 }
 
 long compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			    struct compat_sigaltstack __user *uoss_ptr,
-			    struct pt_regs *regs)
+			    struct compat_sigaltstack __user *uoss_ptr)
 {
 	stack_t uss, uoss;
 	int ret;
@@ -219,7 +218,7 @@
 	set_fs(KERNEL_DS);
 	ret = do_sigaltstack(uss_ptr ? (stack_t __user __force *)&uss : NULL,
 			     (stack_t __user __force *)&uoss,
-			     (unsigned long)compat_ptr(regs->sp));
+			     (unsigned long)compat_ptr(current_pt_regs()->sp));
 	set_fs(seg);
 	if (ret >= 0 && uoss_ptr)  {
 		if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(*uoss_ptr)) ||
@@ -232,8 +231,9 @@
 }
 
 /* The assembly shim for this function arranges to ignore the return value. */
-long compat_sys_rt_sigreturn(struct pt_regs *regs)
+long compat_sys_rt_sigreturn(void)
 {
+	struct pt_regs *regs = current_pt_regs();
 	struct compat_rt_sigframe __user *frame =
 		(struct compat_rt_sigframe __user *) compat_ptr(regs->sp);
 	sigset_t set;
@@ -248,7 +248,7 @@
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
-	if (compat_sys_sigaltstack(&frame->uc.uc_stack, NULL, regs) != 0)
+	if (compat_sys_sigaltstack(&frame->uc.uc_stack, NULL) != 0)
 		goto badframe;
 
 	return 0;
diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S
index c31637b..f116cb0 100644
--- a/arch/tile/kernel/entry.S
+++ b/arch/tile/kernel/entry.S
@@ -28,17 +28,6 @@
 	STD_ENDPROC(current_text_addr)
 
 /*
- * Implement execve().  The i386 code has a note that forking from kernel
- * space results in no copy on write until the execve, so we should be
- * careful not to write to the stack here.
- */
-STD_ENTRY(kernel_execve)
-	moveli TREG_SYSCALL_NR_NAME, __NR_execve
-	swint1
-	jrp lr
-	STD_ENDPROC(kernel_execve)
-
-/*
  * We don't run this function directly, but instead copy it to a page
  * we map into every user process.  See vdso_setup().
  *
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index 6943515..f212bf7 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -1291,6 +1291,21 @@
 	}
 	STD_ENDPROC(ret_from_fork)
 
+STD_ENTRY(ret_from_kernel_thread)
+	jal     sim_notify_fork
+	jal     schedule_tail
+	FEEDBACK_REENTER(ret_from_fork)
+	{
+	 move   r0, r31
+	 jalr   r30
+	}
+	FEEDBACK_REENTER(ret_from_kernel_thread)
+	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
+	STD_ENDPROC(ret_from_kernel_thread)
+
 	/*
 	 * Code for ill interrupt.
 	 */
@@ -1437,15 +1452,6 @@
 	panic   "Unhandled interrupt %#x: PC %#lx"
 	STD_ENDPROC(bad_intr)
 
-/* Put address of pt_regs in reg and jump. */
-#define PTREGS_SYSCALL(x, reg)                          \
-	STD_ENTRY(_##x);                                \
-	{                                               \
-	 PTREGS_PTR(reg, PTREGS_OFFSET_BASE);           \
-	 j      x                                       \
-	};                                              \
-	STD_ENDPROC(_##x)
-
 /*
  * Special-case sigreturn to not write r0 to the stack on return.
  * This is technically more efficient, but it also avoids difficulties
@@ -1461,12 +1467,9 @@
 	};                                              \
 	STD_ENDPROC(_##x)
 
-PTREGS_SYSCALL(sys_execve, r3)
-PTREGS_SYSCALL(sys_sigaltstack, r2)
 PTREGS_SYSCALL_SIGRETURN(sys_rt_sigreturn, r0)
-PTREGS_SYSCALL(sys_cmpxchg_badaddr, r1)
 
-/* Save additional callee-saves to pt_regs, put address in r4 and jump. */
+/* Save additional callee-saves to pt_regs and jump to standard function. */
 STD_ENTRY(_sys_clone)
 	push_extra_callee_saves r4
 	j       sys_clone
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 7c06d59..54bc9a6 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -1150,6 +1150,21 @@
 	}
 	STD_ENDPROC(ret_from_fork)
 
+STD_ENTRY(ret_from_kernel_thread)
+	jal     sim_notify_fork
+	jal     schedule_tail
+	FEEDBACK_REENTER(ret_from_fork)
+	{
+	 move   r0, r31
+	 jalr   r30
+	}
+	FEEDBACK_REENTER(ret_from_kernel_thread)
+	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
+	STD_ENDPROC(ret_from_kernel_thread)
+
 /* Various stub interrupt handlers and syscall handlers */
 
 STD_ENTRY_LOCAL(_kernel_double_fault)
@@ -1166,15 +1181,6 @@
 	panic   "Unhandled interrupt %#x: PC %#lx"
 	STD_ENDPROC(bad_intr)
 
-/* Put address of pt_regs in reg and jump. */
-#define PTREGS_SYSCALL(x, reg)                          \
-	STD_ENTRY(_##x);                                \
-	{                                               \
-	 PTREGS_PTR(reg, PTREGS_OFFSET_BASE);           \
-	 j      x                                       \
-	};                                              \
-	STD_ENDPROC(_##x)
-
 /*
  * Special-case sigreturn to not write r0 to the stack on return.
  * This is technically more efficient, but it also avoids difficulties
@@ -1190,16 +1196,12 @@
 	};                                              \
 	STD_ENDPROC(_##x)
 
-PTREGS_SYSCALL(sys_execve, r3)
-PTREGS_SYSCALL(sys_sigaltstack, r2)
 PTREGS_SYSCALL_SIGRETURN(sys_rt_sigreturn, r0)
 #ifdef CONFIG_COMPAT
-PTREGS_SYSCALL(compat_sys_execve, r3)
-PTREGS_SYSCALL(compat_sys_sigaltstack, r2)
 PTREGS_SYSCALL_SIGRETURN(compat_sys_rt_sigreturn, r0)
 #endif
 
-/* Save additional callee-saves to pt_regs, put address in r4 and jump. */
+/* Save additional callee-saves to pt_regs and jump to standard function. */
 STD_ENTRY(_sys_clone)
 	push_extra_callee_saves r4
 	j       sys_clone
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 307d010..1c20029 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -157,18 +157,50 @@
 static void save_arch_state(struct thread_struct *t);
 
 int copy_thread(unsigned long clone_flags, unsigned long sp,
-		unsigned long stack_size,
-		struct task_struct *p, struct pt_regs *regs)
+		unsigned long arg,
+		struct task_struct *p, struct pt_regs *unused)
 {
-	struct pt_regs *childregs;
+	struct pt_regs *childregs = task_pt_regs(p);
 	unsigned long ksp;
+	unsigned long *callee_regs;
 
 	/*
-	 * When creating a new kernel thread we pass sp as zero.
-	 * Assign it to a reasonable value now that we have the stack.
+	 * Set up the stack and stack pointer appropriately for the
+	 * new child to find itself woken up in __switch_to().
+	 * The callee-saved registers must be on the stack to be read;
+	 * the new task will then jump to assembly support to handle
+	 * calling schedule_tail(), etc., and (for userspace tasks)
+	 * returning to the context set up in the pt_regs.
 	 */
-	if (sp == 0 && regs->ex1 == PL_ICS_EX1(KERNEL_PL, 0))
-		sp = KSTK_TOP(p);
+	ksp = (unsigned long) childregs;
+	ksp -= C_ABI_SAVE_AREA_SIZE;   /* interrupt-entry save area */
+	((long *)ksp)[0] = ((long *)ksp)[1] = 0;
+	ksp -= CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long);
+	callee_regs = (unsigned long *)ksp;
+	ksp -= C_ABI_SAVE_AREA_SIZE;   /* __switch_to() save area */
+	((long *)ksp)[0] = ((long *)ksp)[1] = 0;
+	p->thread.ksp = ksp;
+
+	/* Record the pid of the task that created this one. */
+	p->thread.creator_pid = current->pid;
+
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		/* kernel thread */
+		memset(childregs, 0, sizeof(struct pt_regs));
+		memset(&callee_regs[2], 0,
+		       (CALLEE_SAVED_REGS_COUNT - 2) * sizeof(unsigned long));
+		callee_regs[0] = sp;   /* r30 = function */
+		callee_regs[1] = arg;  /* r31 = arg */
+		childregs->ex1 = PL_ICS_EX1(KERNEL_PL, 0);
+		p->thread.pc = (unsigned long) ret_from_kernel_thread;
+		return 0;
+	}
+
+	/*
+	 * Start new thread in ret_from_fork so it schedules properly
+	 * and then return from interrupt like the parent.
+	 */
+	p->thread.pc = (unsigned long) ret_from_fork;
 
 	/*
 	 * Do not clone step state from the parent; each thread
@@ -177,51 +209,26 @@
 	task_thread_info(p)->step_state = NULL;
 
 	/*
-	 * Start new thread in ret_from_fork so it schedules properly
-	 * and then return from interrupt like the parent.
-	 */
-	p->thread.pc = (unsigned long) ret_from_fork;
-
-	/* Save user stack top pointer so we can ID the stack vm area later. */
-	p->thread.usp0 = sp;
-
-	/* Record the pid of the process that created this one. */
-	p->thread.creator_pid = current->pid;
-
-	/*
 	 * Copy the registers onto the kernel stack so the
 	 * return-from-interrupt code will reload it into registers.
 	 */
-	childregs = task_pt_regs(p);
-	*childregs = *regs;
+	*childregs = *current_pt_regs();
 	childregs->regs[0] = 0;         /* return value is zero */
-	childregs->sp = sp;  /* override with new user stack pointer */
+	if (sp)
+		childregs->sp = sp;  /* override with new user stack pointer */
+	memcpy(callee_regs, &childregs->regs[CALLEE_SAVED_FIRST_REG],
+	       CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long));
+
+	/* Save user stack top pointer so we can ID the stack vm area later. */
+	p->thread.usp0 = childregs->sp;
 
 	/*
 	 * If CLONE_SETTLS is set, set "tp" in the new task to "r4",
 	 * which is passed in as arg #5 to sys_clone().
 	 */
 	if (clone_flags & CLONE_SETTLS)
-		childregs->tp = regs->regs[4];
+		childregs->tp = childregs->regs[4];
 
-	/*
-	 * Copy the callee-saved registers from the passed pt_regs struct
-	 * into the context-switch callee-saved registers area.
-	 * This way when we start the interrupt-return sequence, the
-	 * callee-save registers will be correctly in registers, which
-	 * is how we assume the compiler leaves them as we start doing
-	 * the normal return-from-interrupt path after calling C code.
-	 * Zero out the C ABI save area to mark the top of the stack.
-	 */
-	ksp = (unsigned long) childregs;
-	ksp -= C_ABI_SAVE_AREA_SIZE;   /* interrupt-entry save area */
-	((long *)ksp)[0] = ((long *)ksp)[1] = 0;
-	ksp -= CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long);
-	memcpy((void *)ksp, &regs->regs[CALLEE_SAVED_FIRST_REG],
-	       CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long));
-	ksp -= C_ABI_SAVE_AREA_SIZE;   /* __switch_to() save area */
-	((long *)ksp)[0] = ((long *)ksp)[1] = 0;
-	p->thread.ksp = ksp;
 
 #if CHIP_HAS_TILE_DMA()
 	/*
@@ -578,61 +585,13 @@
 }
 
 /* Note there is an implicit fifth argument if (clone_flags & CLONE_SETTLS). */
-SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
-		void __user *, parent_tidptr, void __user *, child_tidptr,
-		struct pt_regs *, regs)
+SYSCALL_DEFINE4(clone, unsigned long, clone_flags, unsigned long, newsp,
+		void __user *, parent_tidptr, void __user *, child_tidptr)
 {
-	if (!newsp)
-		newsp = regs->sp;
-	return do_fork(clone_flags, newsp, regs, 0,
+	return do_fork(clone_flags, newsp, current_pt_regs(), 0,
 		       parent_tidptr, child_tidptr);
 }
 
-/*
- * sys_execve() executes a new program.
- */
-SYSCALL_DEFINE4(execve, const char __user *, path,
-		const char __user *const __user *, argv,
-		const char __user *const __user *, envp,
-		struct pt_regs *, regs)
-{
-	long error;
-	struct filename *filename;
-
-	filename = getname(path);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = do_execve(filename->name, argv, envp, regs);
-	putname(filename);
-	if (error == 0)
-		single_step_execve();
-out:
-	return error;
-}
-
-#ifdef CONFIG_COMPAT
-long compat_sys_execve(const char __user *path,
-		       compat_uptr_t __user *argv,
-		       compat_uptr_t __user *envp,
-		       struct pt_regs *regs)
-{
-	long error;
-	struct filename *filename;
-
-	filename = getname(path);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = compat_do_execve(filename->name, argv, envp, regs);
-	putname(filename);
-	if (error == 0)
-		single_step_execve();
-out:
-	return error;
-}
-#endif
-
 unsigned long get_wchan(struct task_struct *p)
 {
 	struct KBacktraceIterator kbt;
@@ -650,37 +609,6 @@
 	return 0;
 }
 
-/*
- * We pass in lr as zero (cleared in kernel_thread) and the caller
- * part of the backtrace ABI on the stack also zeroed (in copy_thread)
- * so that backtraces will stop with this function.
- * Note that we don't use r0, since copy_thread() clears it.
- */
-static void start_kernel_thread(int dummy, int (*fn)(int), int arg)
-{
-	do_exit(fn(arg));
-}
-
-/*
- * Create a kernel thread
- */
-int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
-{
-	struct pt_regs regs;
-
-	memset(&regs, 0, sizeof(regs));
-	regs.ex1 = PL_ICS_EX1(KERNEL_PL, 0);  /* run at kernel PL, no ICS */
-	regs.pc = (long) start_kernel_thread;
-	regs.flags = PT_FLAGS_CALLER_SAVES;   /* need to restore r1 and r2 */
-	regs.regs[1] = (long) fn;             /* function pointer */
-	regs.regs[2] = (long) arg;            /* parameter register */
-
-	/* Ok, create the new process.. */
-	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs,
-		       0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
 /* Flush thread state. */
 void flush_thread(void)
 {
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index 67efb65..657a7ac 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -37,10 +37,10 @@
 
 #define DEBUG_SIG 0
 
-SYSCALL_DEFINE3(sigaltstack, const stack_t __user *, uss,
-		stack_t __user *, uoss, struct pt_regs *, regs)
+SYSCALL_DEFINE2(sigaltstack, const stack_t __user *, uss,
+		stack_t __user *, uoss)
 {
-	return do_sigaltstack(uss, uoss, regs->sp);
+	return do_sigaltstack(uss, uoss, current_pt_regs()->sp);
 }
 
 
@@ -83,8 +83,9 @@
 }
 
 /* The assembly shim for this function arranges to ignore the return value. */
-SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs)
+SYSCALL_DEFINE0(rt_sigreturn)
 {
+	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe __user *frame =
 		(struct rt_sigframe __user *)(regs->sp);
 	sigset_t set;
diff --git a/arch/tile/kernel/sys.c b/arch/tile/kernel/sys.c
index b08095b..02ff5c0 100644
--- a/arch/tile/kernel/sys.c
+++ b/arch/tile/kernel/sys.c
@@ -106,14 +106,11 @@
 #define sys_readahead sys32_readahead
 #endif
 
-/* Call the trampolines to manage pt_regs where necessary. */
-#define sys_execve _sys_execve
-#define sys_sigaltstack _sys_sigaltstack
+/* Call the assembly trampolines where necessary. */
+#undef sys_rt_sigreturn
 #define sys_rt_sigreturn _sys_rt_sigreturn
+#undef sys_clone
 #define sys_clone _sys_clone
-#ifndef __tilegx__
-#define sys_cmpxchg_badaddr _sys_cmpxchg_badaddr
-#endif
 
 /*
  * Note that we can't include <linux/unistd.h> here since the header
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index fe811fa..3d2b81c 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -70,9 +70,10 @@
  * Synthesize the fault a PL0 process would get by doing a word-load of
  * an unaligned address or a high kernel address.
  */
-SYSCALL_DEFINE2(cmpxchg_badaddr, unsigned long, address,
-		struct pt_regs *, regs)
+SYSCALL_DEFINE1(cmpxchg_badaddr, unsigned long, address)
 {
+	struct pt_regs *regs = current_pt_regs();
+
 	if (address >= PAGE_OFFSET)
 		force_sig_info_fault("atomic segfault", SIGSEGV, SEGV_MAPERR,
 				     address, INT_DTLB_MISS, current, regs);
diff --git a/include/linux/compat.h b/include/linux/compat.h
index d0ced10..d2db710 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -286,10 +286,8 @@
 
 int compat_do_execve(const char *filename, const compat_uptr_t __user *argv,
 		     const compat_uptr_t __user *envp, struct pt_regs *regs);
-#ifdef __ARCH_WANT_SYS_EXECVE
 asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
 		     const compat_uptr_t __user *envp);
-#endif
 
 asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		compat_ulong_t __user *outp, compat_ulong_t __user *exp,