Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc:
  sparc: Size mondo queues more sanely.
  sparc: Access kernel TSB using physical addressing when possible.
  sparc: Fix __atomic_add_unless() return value.
  sparc: use kbuild-generic support for true asm-generic header files
  sparc: Use popc when possible for ffs/__ffs/ffz.
  sparc: Set reboot-cmd using reboot data hypervisor call if available.
  sparc: Add some missing hypervisor API groups.
  sparc: Use hweight64() in popc emulation.
  sparc: Use popc if possible for hweight routines.
  sparc: Minor tweaks to Niagara page copy/clear.
  sparc: Sanitize cpu feature detection and reporting.
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 3c93f08..2c2e388 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -16,3 +16,8 @@
 header-y += uctx.h
 header-y += utrap.h
 header-y += watchdog.h
+
+generic-y += div64.h
+generic-y += local64.h
+generic-y += irq_regs.h
+generic-y += local.h
diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h
index 325e295..29011cc 100644
--- a/arch/sparc/include/asm/bitops_64.h
+++ b/arch/sparc/include/asm/bitops_64.h
@@ -26,61 +26,28 @@
 #define smp_mb__before_clear_bit()	barrier()
 #define smp_mb__after_clear_bit()	barrier()
 
-#include <asm-generic/bitops/ffz.h>
-#include <asm-generic/bitops/__ffs.h>
 #include <asm-generic/bitops/fls.h>
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 
 #ifdef __KERNEL__
 
+extern int ffs(int x);
+extern unsigned long __ffs(unsigned long);
+
+#include <asm-generic/bitops/ffz.h>
 #include <asm-generic/bitops/sched.h>
-#include <asm-generic/bitops/ffs.h>
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
  * of bits set) of a N-bit word
  */
 
-#ifdef ULTRA_HAS_POPULATION_COUNT
+extern unsigned long __arch_hweight64(__u64 w);
+extern unsigned int __arch_hweight32(unsigned int w);
+extern unsigned int __arch_hweight16(unsigned int w);
+extern unsigned int __arch_hweight8(unsigned int w);
 
-static inline unsigned int __arch_hweight64(unsigned long w)
-{
-	unsigned int res;
-
-	__asm__ ("popc %1,%0" : "=r" (res) : "r" (w));
-	return res;
-}
-
-static inline unsigned int __arch_hweight32(unsigned int w)
-{
-	unsigned int res;
-
-	__asm__ ("popc %1,%0" : "=r" (res) : "r" (w & 0xffffffff));
-	return res;
-}
-
-static inline unsigned int __arch_hweight16(unsigned int w)
-{
-	unsigned int res;
-
-	__asm__ ("popc %1,%0" : "=r" (res) : "r" (w & 0xffff));
-	return res;
-}
-
-static inline unsigned int __arch_hweight8(unsigned int w)
-{
-	unsigned int res;
-
-	__asm__ ("popc %1,%0" : "=r" (res) : "r" (w & 0xff));
-	return res;
-}
-
-#else
-
-#include <asm-generic/bitops/arch_hweight.h>
-
-#endif
 #include <asm-generic/bitops/const_hweight.h>
 #include <asm-generic/bitops/lock.h>
 #endif /* __KERNEL__ */
diff --git a/arch/sparc/include/asm/div64.h b/arch/sparc/include/asm/div64.h
deleted file mode 100644
index 6cd978c..0000000
--- a/arch/sparc/include/asm/div64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/div64.h>
diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h
index 64f7a00..7df8b7f 100644
--- a/arch/sparc/include/asm/elf_64.h
+++ b/arch/sparc/include/asm/elf_64.h
@@ -59,15 +59,33 @@
 #define R_SPARC_6		45
 
 /* Bits present in AT_HWCAP, primarily for Sparc32.  */
+#define HWCAP_SPARC_FLUSH       0x00000001
+#define HWCAP_SPARC_STBAR       0x00000002
+#define HWCAP_SPARC_SWAP        0x00000004
+#define HWCAP_SPARC_MULDIV      0x00000008
+#define HWCAP_SPARC_V9		0x00000010
+#define HWCAP_SPARC_ULTRA3	0x00000020
+#define HWCAP_SPARC_BLKINIT	0x00000040
+#define HWCAP_SPARC_N2		0x00000080
 
-#define HWCAP_SPARC_FLUSH       1    /* CPU supports flush instruction. */
-#define HWCAP_SPARC_STBAR       2
-#define HWCAP_SPARC_SWAP        4
-#define HWCAP_SPARC_MULDIV      8
-#define HWCAP_SPARC_V9		16
-#define HWCAP_SPARC_ULTRA3	32
-#define HWCAP_SPARC_BLKINIT	64
-#define HWCAP_SPARC_N2		128
+/* Solaris compatible AT_HWCAP bits. */
+#define AV_SPARC_MUL32		0x00000100 /* 32x32 multiply is efficient */
+#define AV_SPARC_DIV32		0x00000200 /* 32x32 divide is efficient */
+#define AV_SPARC_FSMULD		0x00000400 /* 'fsmuld' is efficient */
+#define AV_SPARC_V8PLUS		0x00000800 /* v9 insn available to 32bit */
+#define AV_SPARC_POPC		0x00001000 /* 'popc' is efficient */
+#define AV_SPARC_VIS		0x00002000 /* VIS insns available */
+#define AV_SPARC_VIS2		0x00004000 /* VIS2 insns available */
+#define AV_SPARC_ASI_BLK_INIT	0x00008000 /* block init ASIs available */
+#define AV_SPARC_FMAF		0x00010000 /* fused multiply-add */
+#define AV_SPARC_VIS3		0x00020000 /* VIS3 insns available */
+#define AV_SPARC_HPC		0x00040000 /* HPC insns available */
+#define AV_SPARC_RANDOM		0x00080000 /* 'random' insn available */
+#define AV_SPARC_TRANS		0x00100000 /* transaction insns available */
+#define AV_SPARC_FJFMAU		0x00200000 /* unfused multiply-add */
+#define AV_SPARC_IMA		0x00400000 /* integer multiply-add */
+#define AV_SPARC_ASI_CACHE_SPARING \
+				0x00800000 /* cache sparing ASIs available */
 
 #define CORE_DUMP_USE_REGSET
 
@@ -162,33 +180,8 @@
 #define ELF_ET_DYN_BASE		0x0000010000000000UL
 #define COMPAT_ELF_ET_DYN_BASE	0x0000000070000000UL
 
-
-/* This yields a mask that user programs can use to figure out what
-   instruction set this cpu supports.  */
-
-/* On Ultra, we support all of the v8 capabilities. */
-static inline unsigned int sparc64_elf_hwcap(void)
-{
-	unsigned int cap = (HWCAP_SPARC_FLUSH | HWCAP_SPARC_STBAR |
-			    HWCAP_SPARC_SWAP | HWCAP_SPARC_MULDIV |
-			    HWCAP_SPARC_V9);
-
-	if (tlb_type == cheetah || tlb_type == cheetah_plus)
-		cap |= HWCAP_SPARC_ULTRA3;
-	else if (tlb_type == hypervisor) {
-		if (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 ||
-		    sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
-		    sun4v_chip_type == SUN4V_CHIP_NIAGARA3)
-			cap |= HWCAP_SPARC_BLKINIT;
-		if (sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
-		    sun4v_chip_type == SUN4V_CHIP_NIAGARA3)
-			cap |= HWCAP_SPARC_N2;
-	}
-
-	return cap;
-}
-
-#define ELF_HWCAP	sparc64_elf_hwcap()
+extern unsigned long sparc64_elf_hwcap;
+#define ELF_HWCAP	sparc64_elf_hwcap
 
 /* This yields a string that ld.so will use to load implementation
    specific libraries for optimization.  This is more specific in
diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h
index 7a5f80d..015a761 100644
--- a/arch/sparc/include/asm/hypervisor.h
+++ b/arch/sparc/include/asm/hypervisor.h
@@ -2927,6 +2927,13 @@
 #define HV_FAST_FIRE_GET_PERFREG	0x120
 #define HV_FAST_FIRE_SET_PERFREG	0x121
 
+#define HV_FAST_REBOOT_DATA_SET		0x172
+
+#ifndef __ASSEMBLY__
+extern unsigned long sun4v_reboot_data_set(unsigned long ra,
+					   unsigned long len);
+#endif
+
 /* Function numbers for HV_CORE_TRAP.  */
 #define HV_CORE_SET_VER			0x00
 #define HV_CORE_PUTCHAR			0x01
@@ -2940,11 +2947,17 @@
 #define HV_GRP_CORE			0x0001
 #define HV_GRP_INTR			0x0002
 #define HV_GRP_SOFT_STATE		0x0003
+#define HV_GRP_TM			0x0080
 #define HV_GRP_PCI			0x0100
 #define HV_GRP_LDOM			0x0101
 #define HV_GRP_SVC_CHAN			0x0102
 #define HV_GRP_NCS			0x0103
 #define HV_GRP_RNG			0x0104
+#define HV_GRP_PBOOT			0x0105
+#define HV_GRP_TPM			0x0107
+#define HV_GRP_SDIO			0x0108
+#define HV_GRP_SDIO_ERR			0x0109
+#define HV_GRP_REBOOT_DATA		0x0110
 #define HV_GRP_NIAG_PERF		0x0200
 #define HV_GRP_FIRE_PERF		0x0201
 #define HV_GRP_N2_CPU			0x0202
diff --git a/arch/sparc/include/asm/irq_regs.h b/arch/sparc/include/asm/irq_regs.h
deleted file mode 100644
index 3dd9c0b..0000000
--- a/arch/sparc/include/asm/irq_regs.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/irq_regs.h>
diff --git a/arch/sparc/include/asm/local.h b/arch/sparc/include/asm/local.h
deleted file mode 100644
index bc80815..0000000
--- a/arch/sparc/include/asm/local.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _SPARC_LOCAL_H
-#define _SPARC_LOCAL_H
-
-#include <asm-generic/local.h>
-
-#endif
diff --git a/arch/sparc/include/asm/local64.h b/arch/sparc/include/asm/local64.h
deleted file mode 100644
index 36c93b5..0000000
--- a/arch/sparc/include/asm/local64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local64.h>
diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h
index 83c571d..1a8afd1 100644
--- a/arch/sparc/include/asm/tsb.h
+++ b/arch/sparc/include/asm/tsb.h
@@ -133,29 +133,6 @@
 	sub	TSB, 0x8, TSB;   \
 	TSB_STORE(TSB, TAG);
 
-#define KTSB_LOAD_QUAD(TSB, REG) \
-	ldda		[TSB] ASI_NUCLEUS_QUAD_LDD, REG;
-
-#define KTSB_STORE(ADDR, VAL) \
-	stxa		VAL, [ADDR] ASI_N;
-
-#define KTSB_LOCK_TAG(TSB, REG1, REG2)	\
-99:	lduwa	[TSB] ASI_N, REG1;	\
-	sethi	%hi(TSB_TAG_LOCK_HIGH), REG2;\
-	andcc	REG1, REG2, %g0;	\
-	bne,pn	%icc, 99b;		\
-	 nop;				\
-	casa	[TSB] ASI_N, REG1, REG2;\
-	cmp	REG1, REG2;		\
-	bne,pn	%icc, 99b;		\
-	 nop;				\
-
-#define KTSB_WRITE(TSB, TTE, TAG) \
-	add	TSB, 0x8, TSB;   \
-	stxa	TTE, [TSB] ASI_N;     \
-	sub	TSB, 0x8, TSB;   \
-	stxa	TAG, [TSB] ASI_N;
-
 	/* Do a kernel page table walk.  Leaves physical PTE pointer in
 	 * REG1.  Jumps to FAIL_LABEL on early page table walk termination.
 	 * VADDR will not be clobbered, but REG2 will.
@@ -239,6 +216,8 @@
 	(KERNEL_TSB_SIZE_BYTES / 16)
 #define KERNEL_TSB4M_NENTRIES	4096
 
+#define KTSB_PHYS_SHIFT		15
+
 	/* Do a kernel TSB lookup at tl>0 on VADDR+TAG, branch to OK_LABEL
 	 * on TSB hit.  REG1, REG2, REG3, and REG4 are used as temporaries
 	 * and the found TTE will be left in REG1.  REG3 and REG4 must
@@ -247,13 +226,22 @@
 	 * VADDR and TAG will be preserved and not clobbered by this macro.
 	 */
 #define KERN_TSB_LOOKUP_TL1(VADDR, TAG, REG1, REG2, REG3, REG4, OK_LABEL) \
-	sethi		%hi(swapper_tsb), REG1; \
+661:	sethi		%hi(swapper_tsb), REG1;			\
 	or		REG1, %lo(swapper_tsb), REG1; \
+	.section	.swapper_tsb_phys_patch, "ax"; \
+	.word		661b; \
+	.previous; \
+661:	nop; \
+	.section	.tsb_ldquad_phys_patch, "ax"; \
+	.word		661b; \
+	sllx		REG1, KTSB_PHYS_SHIFT, REG1; \
+	sllx		REG1, KTSB_PHYS_SHIFT, REG1; \
+	.previous; \
 	srlx		VADDR, PAGE_SHIFT, REG2; \
 	and		REG2, (KERNEL_TSB_NENTRIES - 1), REG2; \
 	sllx		REG2, 4, REG2; \
 	add		REG1, REG2, REG2; \
-	KTSB_LOAD_QUAD(REG2, REG3); \
+	TSB_LOAD_QUAD(REG2, REG3); \
 	cmp		REG3, TAG; \
 	be,a,pt		%xcc, OK_LABEL; \
 	 mov		REG4, REG1;
@@ -263,12 +251,21 @@
 	 * we can make use of that for the index computation.
 	 */
 #define KERN_TSB4M_LOOKUP_TL1(TAG, REG1, REG2, REG3, REG4, OK_LABEL) \
-	sethi		%hi(swapper_4m_tsb), REG1; \
+661:	sethi		%hi(swapper_4m_tsb), REG1;	     \
 	or		REG1, %lo(swapper_4m_tsb), REG1; \
+	.section	.swapper_4m_tsb_phys_patch, "ax"; \
+	.word		661b; \
+	.previous; \
+661:	nop; \
+	.section	.tsb_ldquad_phys_patch, "ax"; \
+	.word		661b; \
+	sllx		REG1, KTSB_PHYS_SHIFT, REG1; \
+	sllx		REG1, KTSB_PHYS_SHIFT, REG1; \
+	.previous; \
 	and		TAG, (KERNEL_TSB4M_NENTRIES - 1), REG2; \
 	sllx		REG2, 4, REG2; \
 	add		REG1, REG2, REG2; \
-	KTSB_LOAD_QUAD(REG2, REG3); \
+	TSB_LOAD_QUAD(REG2, REG3); \
 	cmp		REG3, TAG; \
 	be,a,pt		%xcc, OK_LABEL; \
 	 mov		REG4, REG1;
diff --git a/arch/sparc/kernel/cpu.c b/arch/sparc/kernel/cpu.c
index 17cf290..9810fd8 100644
--- a/arch/sparc/kernel/cpu.c
+++ b/arch/sparc/kernel/cpu.c
@@ -396,6 +396,7 @@
 		   , cpu_data(0).clock_tick
 #endif
 		);
+	cpucap_info(m);
 #ifdef CONFIG_SMP
 	smp_bogo(m);
 #endif
diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c
index dd1342c..490e541 100644
--- a/arch/sparc/kernel/ds.c
+++ b/arch/sparc/kernel/ds.c
@@ -15,12 +15,15 @@
 #include <linux/reboot.h>
 #include <linux/cpu.h>
 
+#include <asm/hypervisor.h>
 #include <asm/ldc.h>
 #include <asm/vio.h>
 #include <asm/mdesc.h>
 #include <asm/head.h>
 #include <asm/irq.h>
 
+#include "kernel.h"
+
 #define DRV_MODULE_NAME		"ds"
 #define PFX DRV_MODULE_NAME	": "
 #define DRV_MODULE_VERSION	"1.0"
@@ -828,18 +831,32 @@
 	}
 }
 
+static char full_boot_str[256] __attribute__((aligned(32)));
+static int reboot_data_supported;
+
 void ldom_reboot(const char *boot_command)
 {
 	/* Don't bother with any of this if the boot_command
 	 * is empty.
 	 */
 	if (boot_command && strlen(boot_command)) {
-		char full_boot_str[256];
+		unsigned long len;
 
 		strcpy(full_boot_str, "boot ");
 		strcpy(full_boot_str + strlen("boot "), boot_command);
+		len = strlen(full_boot_str);
 
-		ldom_set_var("reboot-command", full_boot_str);
+		if (reboot_data_supported) {
+			unsigned long ra = kimage_addr_to_ra(full_boot_str);
+			unsigned long hv_ret;
+
+			hv_ret = sun4v_reboot_data_set(ra, len);
+			if (hv_ret != HV_EOK)
+				pr_err("SUN4V: Unable to set reboot data "
+				       "hv_ret=%lu\n", hv_ret);
+		} else {
+			ldom_set_var("reboot-command", full_boot_str);
+		}
 	}
 	sun4v_mach_sir();
 }
@@ -1237,6 +1254,15 @@
 
 static int __init ds_init(void)
 {
+	unsigned long hv_ret, major, minor;
+
+	hv_ret = sun4v_get_version(HV_GRP_REBOOT_DATA, &major, &minor);
+	if (hv_ret == HV_EOK) {
+		pr_info("SUN4V: Reboot data supported (maj=%lu,min=%lu).\n",
+			major, minor);
+		reboot_data_supported = 1;
+	}
+
 	kthread_run(ds_thread, NULL, "kldomd");
 
 	return vio_register_driver(&ds_driver);
diff --git a/arch/sparc/kernel/entry.h b/arch/sparc/kernel/entry.h
index d1f1361..e27f8ea 100644
--- a/arch/sparc/kernel/entry.h
+++ b/arch/sparc/kernel/entry.h
@@ -42,6 +42,20 @@
 extern void fpload(unsigned long *fpregs, unsigned long *fsr);
 
 #else /* CONFIG_SPARC32 */
+struct popc_3insn_patch_entry {
+	unsigned int	addr;
+	unsigned int	insns[3];
+};
+extern struct popc_3insn_patch_entry __popc_3insn_patch,
+	__popc_3insn_patch_end;
+
+struct popc_6insn_patch_entry {
+	unsigned int	addr;
+	unsigned int	insns[6];
+};
+extern struct popc_6insn_patch_entry __popc_6insn_patch,
+	__popc_6insn_patch_end;
+
 extern void __init per_cpu_patch(void);
 extern void __init sun4v_patch(void);
 extern void __init boot_cpu_id_too_large(int cpu);
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index c752603..0eac1b2 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -559,7 +559,7 @@
 	 nop
 	call	niagara_patch_bzero
 	 nop
-	call	niagara2_patch_pageops
+	call	niagara_patch_pageops
 	 nop
 
 	ba,a,pt	%xcc, 80f
diff --git a/arch/sparc/kernel/hvapi.c b/arch/sparc/kernel/hvapi.c
index d306e64..c2d055d 100644
--- a/arch/sparc/kernel/hvapi.c
+++ b/arch/sparc/kernel/hvapi.c
@@ -28,11 +28,17 @@
 	{ .group = HV_GRP_CORE,		.flags = FLAG_PRE_API	},
 	{ .group = HV_GRP_INTR,					},
 	{ .group = HV_GRP_SOFT_STATE,				},
+	{ .group = HV_GRP_TM,					},
 	{ .group = HV_GRP_PCI,		.flags = FLAG_PRE_API	},
 	{ .group = HV_GRP_LDOM,					},
 	{ .group = HV_GRP_SVC_CHAN,	.flags = FLAG_PRE_API	},
 	{ .group = HV_GRP_NCS,		.flags = FLAG_PRE_API	},
 	{ .group = HV_GRP_RNG,					},
+	{ .group = HV_GRP_PBOOT,				},
+	{ .group = HV_GRP_TPM,					},
+	{ .group = HV_GRP_SDIO,					},
+	{ .group = HV_GRP_SDIO_ERR,				},
+	{ .group = HV_GRP_REBOOT_DATA,				},
 	{ .group = HV_GRP_NIAG_PERF,	.flags = FLAG_PRE_API	},
 	{ .group = HV_GRP_FIRE_PERF,				},
 	{ .group = HV_GRP_N2_CPU,				},
diff --git a/arch/sparc/kernel/hvcalls.S b/arch/sparc/kernel/hvcalls.S
index 8a5f35f..58d60de 100644
--- a/arch/sparc/kernel/hvcalls.S
+++ b/arch/sparc/kernel/hvcalls.S
@@ -798,3 +798,10 @@
 	retl
 	 nop
 ENDPROC(sun4v_niagara2_setperf)
+
+ENTRY(sun4v_reboot_data_set)
+	mov	HV_FAST_REBOOT_DATA_SET, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+ENDPROC(sun4v_reboot_data_set)
diff --git a/arch/sparc/kernel/kernel.h b/arch/sparc/kernel/kernel.h
index 6f6544c..fd6c36b 100644
--- a/arch/sparc/kernel/kernel.h
+++ b/arch/sparc/kernel/kernel.h
@@ -4,12 +4,27 @@
 #include <linux/interrupt.h>
 
 #include <asm/traps.h>
+#include <asm/head.h>
+#include <asm/io.h>
 
 /* cpu.c */
 extern const char *sparc_pmu_type;
 extern unsigned int fsr_storage;
 extern int ncpus_probed;
 
+#ifdef CONFIG_SPARC64
+/* setup_64.c */
+struct seq_file;
+extern void cpucap_info(struct seq_file *);
+
+static inline unsigned long kimage_addr_to_ra(const char *p)
+{
+	unsigned long val = (unsigned long) p;
+
+	return kern_base + (val - KERNBASE);
+}
+#endif
+
 #ifdef CONFIG_SPARC32
 /* cpu.c */
 extern void cpu_probe(void);
diff --git a/arch/sparc/kernel/ktlb.S b/arch/sparc/kernel/ktlb.S
index 1d36147..79f3103 100644
--- a/arch/sparc/kernel/ktlb.S
+++ b/arch/sparc/kernel/ktlb.S
@@ -47,16 +47,16 @@
 kvmap_itlb_vmalloc_addr:
 	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_itlb_longpath)
 
-	KTSB_LOCK_TAG(%g1, %g2, %g7)
+	TSB_LOCK_TAG(%g1, %g2, %g7)
 
 	/* Load and check PTE.  */
 	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
 	mov		1, %g7
 	sllx		%g7, TSB_TAG_INVALID_BIT, %g7
 	brgez,a,pn	%g5, kvmap_itlb_longpath
-	 KTSB_STORE(%g1, %g7)
+	 TSB_STORE(%g1, %g7)
 
-	KTSB_WRITE(%g1, %g5, %g6)
+	TSB_WRITE(%g1, %g5, %g6)
 
 	/* fallthrough to TLB load */
 
@@ -102,9 +102,9 @@
 kvmap_itlb_obp:
 	OBP_TRANS_LOOKUP(%g4, %g5, %g2, %g3, kvmap_itlb_longpath)
 
-	KTSB_LOCK_TAG(%g1, %g2, %g7)
+	TSB_LOCK_TAG(%g1, %g2, %g7)
 
-	KTSB_WRITE(%g1, %g5, %g6)
+	TSB_WRITE(%g1, %g5, %g6)
 
 	ba,pt		%xcc, kvmap_itlb_load
 	 nop
@@ -112,17 +112,17 @@
 kvmap_dtlb_obp:
 	OBP_TRANS_LOOKUP(%g4, %g5, %g2, %g3, kvmap_dtlb_longpath)
 
-	KTSB_LOCK_TAG(%g1, %g2, %g7)
+	TSB_LOCK_TAG(%g1, %g2, %g7)
 
-	KTSB_WRITE(%g1, %g5, %g6)
+	TSB_WRITE(%g1, %g5, %g6)
 
 	ba,pt		%xcc, kvmap_dtlb_load
 	 nop
 
 	.align		32
 kvmap_dtlb_tsb4m_load:
-	KTSB_LOCK_TAG(%g1, %g2, %g7)
-	KTSB_WRITE(%g1, %g5, %g6)
+	TSB_LOCK_TAG(%g1, %g2, %g7)
+	TSB_WRITE(%g1, %g5, %g6)
 	ba,pt		%xcc, kvmap_dtlb_load
 	 nop
 
@@ -222,16 +222,16 @@
 kvmap_dtlb_vmalloc_addr:
 	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_dtlb_longpath)
 
-	KTSB_LOCK_TAG(%g1, %g2, %g7)
+	TSB_LOCK_TAG(%g1, %g2, %g7)
 
 	/* Load and check PTE.  */
 	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
 	mov		1, %g7
 	sllx		%g7, TSB_TAG_INVALID_BIT, %g7
 	brgez,a,pn	%g5, kvmap_dtlb_longpath
-	 KTSB_STORE(%g1, %g7)
+	 TSB_STORE(%g1, %g7)
 
-	KTSB_WRITE(%g1, %g5, %g6)
+	TSB_WRITE(%g1, %g5, %g6)
 
 	/* fallthrough to TLB load */
 
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 42f28c7..acaebb6 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -508,6 +508,8 @@
 }
 EXPORT_SYMBOL(mdesc_node_name);
 
+static u64 max_cpus = 64;
+
 static void __init report_platform_properties(void)
 {
 	struct mdesc_handle *hp = mdesc_grab();
@@ -543,8 +545,10 @@
 	if (v)
 		printk("PLATFORM: watchdog-max-timeout [%llu ms]\n", *v);
 	v = mdesc_get_property(hp, pn, "max-cpus", NULL);
-	if (v)
-		printk("PLATFORM: max-cpus [%llu]\n", *v);
+	if (v) {
+		max_cpus = *v;
+		printk("PLATFORM: max-cpus [%llu]\n", max_cpus);
+	}
 
 #ifdef CONFIG_SMP
 	{
@@ -715,7 +719,7 @@
 }
 
 static void __cpuinit get_one_mondo_bits(const u64 *p, unsigned int *mask,
-					 unsigned char def)
+					 unsigned long def, unsigned long max)
 {
 	u64 val;
 
@@ -726,6 +730,9 @@
 	if (!val || val >= 64)
 		goto use_default;
 
+	if (val > max)
+		val = max;
+
 	*mask = ((1U << val) * 64U) - 1U;
 	return;
 
@@ -736,19 +743,28 @@
 static void __cpuinit get_mondo_data(struct mdesc_handle *hp, u64 mp,
 				     struct trap_per_cpu *tb)
 {
+	static int printed;
 	const u64 *val;
 
 	val = mdesc_get_property(hp, mp, "q-cpu-mondo-#bits", NULL);
-	get_one_mondo_bits(val, &tb->cpu_mondo_qmask, 7);
+	get_one_mondo_bits(val, &tb->cpu_mondo_qmask, 7, ilog2(max_cpus * 2));
 
 	val = mdesc_get_property(hp, mp, "q-dev-mondo-#bits", NULL);
-	get_one_mondo_bits(val, &tb->dev_mondo_qmask, 7);
+	get_one_mondo_bits(val, &tb->dev_mondo_qmask, 7, 8);
 
 	val = mdesc_get_property(hp, mp, "q-resumable-#bits", NULL);
-	get_one_mondo_bits(val, &tb->resum_qmask, 6);
+	get_one_mondo_bits(val, &tb->resum_qmask, 6, 7);
 
 	val = mdesc_get_property(hp, mp, "q-nonresumable-#bits", NULL);
-	get_one_mondo_bits(val, &tb->nonresum_qmask, 2);
+	get_one_mondo_bits(val, &tb->nonresum_qmask, 2, 2);
+	if (!printed++) {
+		pr_info("SUN4V: Mondo queue sizes "
+			"[cpu(%u) dev(%u) r(%u) nr(%u)]\n",
+			tb->cpu_mondo_qmask + 1,
+			tb->dev_mondo_qmask + 1,
+			tb->resum_qmask + 1,
+			tb->nonresum_qmask + 1);
+	}
 }
 
 static void * __cpuinit mdesc_iterate_over_cpus(void *(*func)(struct mdesc_handle *, u64, int, void *), void *arg, cpumask_t *mask)
diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c
index c4dd099..3e9daea 100644
--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -29,6 +29,7 @@
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
 #include <linux/initrd.h>
+#include <linux/module.h>
 
 #include <asm/system.h>
 #include <asm/io.h>
@@ -46,6 +47,8 @@
 #include <asm/mmu.h>
 #include <asm/ns87303.h>
 #include <asm/btext.h>
+#include <asm/elf.h>
+#include <asm/mdesc.h>
 
 #ifdef CONFIG_IP_PNP
 #include <net/ipconfig.h>
@@ -269,6 +272,40 @@
 	sun4v_hvapi_init();
 }
 
+static void __init popc_patch(void)
+{
+	struct popc_3insn_patch_entry *p3;
+	struct popc_6insn_patch_entry *p6;
+
+	p3 = &__popc_3insn_patch;
+	while (p3 < &__popc_3insn_patch_end) {
+		unsigned long i, addr = p3->addr;
+
+		for (i = 0; i < 3; i++) {
+			*(unsigned int *) (addr +  (i * 4)) = p3->insns[i];
+			wmb();
+			__asm__ __volatile__("flush	%0"
+					     : : "r" (addr +  (i * 4)));
+		}
+
+		p3++;
+	}
+
+	p6 = &__popc_6insn_patch;
+	while (p6 < &__popc_6insn_patch_end) {
+		unsigned long i, addr = p6->addr;
+
+		for (i = 0; i < 6; i++) {
+			*(unsigned int *) (addr +  (i * 4)) = p6->insns[i];
+			wmb();
+			__asm__ __volatile__("flush	%0"
+					     : : "r" (addr +  (i * 4)));
+		}
+
+		p6++;
+	}
+}
+
 #ifdef CONFIG_SMP
 void __init boot_cpu_id_too_large(int cpu)
 {
@@ -278,6 +315,154 @@
 }
 #endif
 
+/* On Ultra, we support all of the v8 capabilities. */
+unsigned long sparc64_elf_hwcap = (HWCAP_SPARC_FLUSH | HWCAP_SPARC_STBAR |
+				   HWCAP_SPARC_SWAP | HWCAP_SPARC_MULDIV |
+				   HWCAP_SPARC_V9);
+EXPORT_SYMBOL(sparc64_elf_hwcap);
+
+static const char *hwcaps[] = {
+	"flush", "stbar", "swap", "muldiv", "v9",
+	"ultra3", "blkinit", "n2",
+
+	/* These strings are as they appear in the machine description
+	 * 'hwcap-list' property for cpu nodes.
+	 */
+	"mul32", "div32", "fsmuld", "v8plus", "popc", "vis", "vis2",
+	"ASIBlkInit", "fmaf", "vis3", "hpc", "random", "trans", "fjfmau",
+	"ima", "cspare",
+};
+
+void cpucap_info(struct seq_file *m)
+{
+	unsigned long caps = sparc64_elf_hwcap;
+	int i, printed = 0;
+
+	seq_puts(m, "cpucaps\t\t: ");
+	for (i = 0; i < ARRAY_SIZE(hwcaps); i++) {
+		unsigned long bit = 1UL << i;
+		if (caps & bit) {
+			seq_printf(m, "%s%s",
+				   printed ? "," : "", hwcaps[i]);
+			printed++;
+		}
+	}
+	seq_putc(m, '\n');
+}
+
+static void __init report_hwcaps(unsigned long caps)
+{
+	int i, printed = 0;
+
+	printk(KERN_INFO "CPU CAPS: [");
+	for (i = 0; i < ARRAY_SIZE(hwcaps); i++) {
+		unsigned long bit = 1UL << i;
+		if (caps & bit) {
+			printk(KERN_CONT "%s%s",
+			       printed ? "," : "", hwcaps[i]);
+			if (++printed == 8) {
+				printk(KERN_CONT "]\n");
+				printk(KERN_INFO "CPU CAPS: [");
+				printed = 0;
+			}
+		}
+	}
+	printk(KERN_CONT "]\n");
+}
+
+static unsigned long __init mdesc_cpu_hwcap_list(void)
+{
+	struct mdesc_handle *hp;
+	unsigned long caps = 0;
+	const char *prop;
+	int len;
+	u64 pn;
+
+	hp = mdesc_grab();
+	if (!hp)
+		return 0;
+
+	pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "cpu");
+	if (pn == MDESC_NODE_NULL)
+		goto out;
+
+	prop = mdesc_get_property(hp, pn, "hwcap-list", &len);
+	if (!prop)
+		goto out;
+
+	while (len) {
+		int i, plen;
+
+		for (i = 0; i < ARRAY_SIZE(hwcaps); i++) {
+			unsigned long bit = 1UL << i;
+
+			if (!strcmp(prop, hwcaps[i])) {
+				caps |= bit;
+				break;
+			}
+		}
+
+		plen = strlen(prop) + 1;
+		prop += plen;
+		len -= plen;
+	}
+
+out:
+	mdesc_release(hp);
+	return caps;
+}
+
+/* This yields a mask that user programs can use to figure out what
+ * instruction set this cpu supports.
+ */
+static void __init init_sparc64_elf_hwcap(void)
+{
+	unsigned long cap = sparc64_elf_hwcap;
+	unsigned long mdesc_caps;
+
+	if (tlb_type == cheetah || tlb_type == cheetah_plus)
+		cap |= HWCAP_SPARC_ULTRA3;
+	else if (tlb_type == hypervisor) {
+		if (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 ||
+		    sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
+		    sun4v_chip_type == SUN4V_CHIP_NIAGARA3)
+			cap |= HWCAP_SPARC_BLKINIT;
+		if (sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
+		    sun4v_chip_type == SUN4V_CHIP_NIAGARA3)
+			cap |= HWCAP_SPARC_N2;
+	}
+
+	cap |= (AV_SPARC_MUL32 | AV_SPARC_DIV32 | AV_SPARC_V8PLUS);
+
+	mdesc_caps = mdesc_cpu_hwcap_list();
+	if (!mdesc_caps) {
+		if (tlb_type == spitfire)
+			cap |= AV_SPARC_VIS;
+		if (tlb_type == cheetah || tlb_type == cheetah_plus)
+			cap |= AV_SPARC_VIS | AV_SPARC_VIS2;
+		if (tlb_type == cheetah_plus)
+			cap |= AV_SPARC_POPC;
+		if (tlb_type == hypervisor) {
+			if (sun4v_chip_type == SUN4V_CHIP_NIAGARA1)
+				cap |= AV_SPARC_ASI_BLK_INIT;
+			if (sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
+			    sun4v_chip_type == SUN4V_CHIP_NIAGARA3)
+				cap |= (AV_SPARC_VIS | AV_SPARC_VIS2 |
+					AV_SPARC_ASI_BLK_INIT |
+					AV_SPARC_POPC);
+			if (sun4v_chip_type == SUN4V_CHIP_NIAGARA3)
+				cap |= (AV_SPARC_VIS3 | AV_SPARC_HPC |
+					AV_SPARC_FMAF);
+		}
+	}
+	sparc64_elf_hwcap = cap | mdesc_caps;
+
+	report_hwcaps(sparc64_elf_hwcap);
+
+	if (sparc64_elf_hwcap & AV_SPARC_POPC)
+		popc_patch();
+}
+
 void __init setup_arch(char **cmdline_p)
 {
 	/* Initialize PROM console and command line. */
@@ -337,6 +522,7 @@
 	init_cur_cpu_trap(current_thread_info());
 
 	paging_init();
+	init_sparc64_elf_hwcap();
 }
 
 extern int stop_a_enabled;
diff --git a/arch/sparc/kernel/sparc_ksyms_64.c b/arch/sparc/kernel/sparc_ksyms_64.c
index 372ad59..83b47ab 100644
--- a/arch/sparc/kernel/sparc_ksyms_64.c
+++ b/arch/sparc/kernel/sparc_ksyms_64.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/bitops.h>
 
 #include <asm/system.h>
 #include <asm/cpudata.h>
@@ -38,5 +39,15 @@
 EXPORT_SYMBOL(sun4v_niagara2_getperf);
 EXPORT_SYMBOL(sun4v_niagara2_setperf);
 
+/* from hweight.S */
+EXPORT_SYMBOL(__arch_hweight8);
+EXPORT_SYMBOL(__arch_hweight16);
+EXPORT_SYMBOL(__arch_hweight32);
+EXPORT_SYMBOL(__arch_hweight64);
+
+/* from ffs_ffz.S */
+EXPORT_SYMBOL(ffs);
+EXPORT_SYMBOL(__ffs);
+
 /* Exporting a symbol from /init/main.c */
 EXPORT_SYMBOL(saved_command_line);
diff --git a/arch/sparc/kernel/sstate.c b/arch/sparc/kernel/sstate.c
index 8cdbe59..c59af54 100644
--- a/arch/sparc/kernel/sstate.c
+++ b/arch/sparc/kernel/sstate.c
@@ -14,15 +14,10 @@
 #include <asm/head.h>
 #include <asm/io.h>
 
+#include "kernel.h"
+
 static int hv_supports_soft_state;
 
-static unsigned long kimage_addr_to_ra(const char *p)
-{
-	unsigned long val = (unsigned long) p;
-
-	return kern_base + (val - KERNBASE);
-}
-
 static void do_set_sstate(unsigned long state, const char *msg)
 {
 	unsigned long err;
diff --git a/arch/sparc/kernel/unaligned_64.c b/arch/sparc/kernel/unaligned_64.c
index 35cff16..76e4ac1 100644
--- a/arch/sparc/kernel/unaligned_64.c
+++ b/arch/sparc/kernel/unaligned_64.c
@@ -22,6 +22,7 @@
 #include <linux/bitops.h>
 #include <linux/perf_event.h>
 #include <linux/ratelimit.h>
+#include <linux/bitops.h>
 #include <asm/fpumacro.h>
 
 enum direction {
@@ -373,16 +374,11 @@
 	}
 }
 
-static char popc_helper[] = {
-0, 1, 1, 2, 1, 2, 2, 3,
-1, 2, 2, 3, 2, 3, 3, 4, 
-};
-
 int handle_popc(u32 insn, struct pt_regs *regs)
 {
-	u64 value;
-	int ret, i, rd = ((insn >> 25) & 0x1f);
 	int from_kernel = (regs->tstate & TSTATE_PRIV) != 0;
+	int ret, rd = ((insn >> 25) & 0x1f);
+	u64 value;
 	                        
 	perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0);
 	if (insn & 0x2000) {
@@ -392,10 +388,7 @@
 		maybe_flush_windows(0, insn & 0x1f, rd, from_kernel);
 		value = fetch_reg(insn & 0x1f, regs);
 	}
-	for (ret = 0, i = 0; i < 16; i++) {
-		ret += popc_helper[value & 0xf];
-		value >>= 4;
-	}
+	ret = hweight64(value);
 	if (rd < 16) {
 		if (rd)
 			regs->u_regs[rd] = ret;
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index c022075..0e16056 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -107,7 +107,26 @@
 		*(.sun4v_2insn_patch)
 		__sun4v_2insn_patch_end = .;
 	}
-
+	.swapper_tsb_phys_patch : {
+		__swapper_tsb_phys_patch = .;
+		*(.swapper_tsb_phys_patch)
+		__swapper_tsb_phys_patch_end = .;
+	}
+	.swapper_4m_tsb_phys_patch : {
+		__swapper_4m_tsb_phys_patch = .;
+		*(.swapper_4m_tsb_phys_patch)
+		__swapper_4m_tsb_phys_patch_end = .;
+	}
+	.popc_3insn_patch : {
+		__popc_3insn_patch = .;
+		*(.popc_3insn_patch)
+		__popc_3insn_patch_end = .;
+	}
+	.popc_6insn_patch : {
+		__popc_6insn_patch = .;
+		*(.popc_6insn_patch)
+		__popc_6insn_patch_end = .;
+	}
 	PERCPU_SECTION(SMP_CACHE_BYTES)
 
 	. = ALIGN(PAGE_SIZE);
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 7f01b8f..a3fc437 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -31,13 +31,13 @@
 lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o
 
 lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o
-lib-$(CONFIG_SPARC64) +=  NG2patch.o NG2page.o
+lib-$(CONFIG_SPARC64) +=  NG2patch.o
 
 lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o
 
 lib-$(CONFIG_SPARC64) += copy_in_user.o user_fixup.o memmove.o
-lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o
+lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o hweight.o ffs.o
 
 obj-y                 += iomap.o
 obj-$(CONFIG_SPARC32) += atomic32.o
diff --git a/arch/sparc/lib/NG2page.S b/arch/sparc/lib/NG2page.S
deleted file mode 100644
index 73b6b7c..0000000
--- a/arch/sparc/lib/NG2page.S
+++ /dev/null
@@ -1,61 +0,0 @@
-/* NG2page.S: Niagara-2 optimized clear and copy page.
- *
- * Copyright (C) 2007 (davem@davemloft.net)
- */
-
-#include <asm/asi.h>
-#include <asm/page.h>
-#include <asm/visasm.h>
-
-	.text
-	.align	32
-
-	/* This is heavily simplified from the sun4u variants
-	 * because Niagara-2 does not have any D-cache aliasing issues.
-	 */
-NG2copy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */
-	prefetch	[%o1 + 0x00], #one_read
-	prefetch	[%o1 + 0x40], #one_read
-	VISEntryHalf
-	set		PAGE_SIZE, %g7
-	sub		%o0, %o1, %g3
-1:	stxa		%g0, [%o1 + %g3] ASI_BLK_INIT_QUAD_LDD_P
-	subcc		%g7, 64, %g7
-	ldda		[%o1] ASI_BLK_P, %f0
-	stda		%f0, [%o1 + %g3] ASI_BLK_P
-	add		%o1, 64, %o1
-	bne,pt		%xcc, 1b
-	 prefetch	[%o1 + 0x40], #one_read
-	membar		#Sync
-	VISExitHalf
-	retl
-	 nop
-
-#define BRANCH_ALWAYS	0x10680000
-#define NOP		0x01000000
-#define NG_DO_PATCH(OLD, NEW)	\
-	sethi	%hi(NEW), %g1; \
-	or	%g1, %lo(NEW), %g1; \
-	sethi	%hi(OLD), %g2; \
-	or	%g2, %lo(OLD), %g2; \
-	sub	%g1, %g2, %g1; \
-	sethi	%hi(BRANCH_ALWAYS), %g3; \
-	sll	%g1, 11, %g1; \
-	srl	%g1, 11 + 2, %g1; \
-	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
-	or	%g3, %g1, %g3; \
-	stw	%g3, [%g2]; \
-	sethi	%hi(NOP), %g3; \
-	or	%g3, %lo(NOP), %g3; \
-	stw	%g3, [%g2 + 0x4]; \
-	flush	%g2;
-
-	.globl	niagara2_patch_pageops
-	.type	niagara2_patch_pageops,#function
-niagara2_patch_pageops:
-	NG_DO_PATCH(copy_user_page, NG2copy_user_page)
-	NG_DO_PATCH(_clear_page, NGclear_page)
-	NG_DO_PATCH(clear_user_page, NGclear_user_page)
-	retl
-	 nop
-	.size	niagara2_patch_pageops,.-niagara2_patch_pageops
diff --git a/arch/sparc/lib/NGpage.S b/arch/sparc/lib/NGpage.S
index 428920d..b9e790b 100644
--- a/arch/sparc/lib/NGpage.S
+++ b/arch/sparc/lib/NGpage.S
@@ -16,55 +16,91 @@
 	 */
 
 NGcopy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */
-	prefetch	[%o1 + 0x00], #one_read
-	mov		8, %g1
-	mov		16, %g2
-	mov		24, %g3
+	save		%sp, -192, %sp
+	rd		%asi, %g3
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
 	set		PAGE_SIZE, %g7
+	prefetch	[%i1 + 0x00], #one_read
+	prefetch	[%i1 + 0x40], #one_read
 
-1:	ldda		[%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2
-	ldda		[%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4
-	prefetch	[%o1 + 0x40], #one_read
-	add		%o1, 32, %o1
-	stxa		%o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
-	ldda		[%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2
-	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
-	ldda		[%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4
-	add		%o1, 32, %o1
-	add		%o0, 32, %o0
-	stxa		%o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
-	subcc		%g7, 64, %g7
+1:	prefetch	[%i1 + 0x80], #one_read
+	prefetch	[%i1 + 0xc0], #one_read
+	ldda		[%i1 + 0x00] %asi, %o2
+	ldda		[%i1 + 0x10] %asi, %o4
+	ldda		[%i1 + 0x20] %asi, %l2
+	ldda		[%i1 + 0x30] %asi, %l4
+	stxa		%o2, [%i0 + 0x00] %asi
+	stxa		%o3, [%i0 + 0x08] %asi
+	stxa		%o4, [%i0 + 0x10] %asi
+	stxa		%o5, [%i0 + 0x18] %asi
+	stxa		%l2, [%i0 + 0x20] %asi
+	stxa		%l3, [%i0 + 0x28] %asi
+	stxa		%l4, [%i0 + 0x30] %asi
+	stxa		%l5, [%i0 + 0x38] %asi
+	ldda		[%i1 + 0x40] %asi, %o2
+	ldda		[%i1 + 0x50] %asi, %o4
+	ldda		[%i1 + 0x60] %asi, %l2
+	ldda		[%i1 + 0x70] %asi, %l4
+	stxa		%o2, [%i0 + 0x40] %asi
+	stxa		%o3, [%i0 + 0x48] %asi
+	stxa		%o4, [%i0 + 0x50] %asi
+	stxa		%o5, [%i0 + 0x58] %asi
+	stxa		%l2, [%i0 + 0x60] %asi
+	stxa		%l3, [%i0 + 0x68] %asi
+	stxa		%l4, [%i0 + 0x70] %asi
+	stxa		%l5, [%i0 + 0x78] %asi
+	add		%i1, 128, %i1
+	subcc		%g7, 128, %g7
 	bne,pt		%xcc, 1b
-	 add		%o0, 32, %o0
+	 add		%i0, 128, %i0
+	wr		%g3, 0x0, %asi
 	membar		#Sync
-	retl
-	 nop
+	ret
+	 restore
 
-	.globl		NGclear_page, NGclear_user_page
+	.align		32
 NGclear_page:		/* %o0=dest */
 NGclear_user_page:	/* %o0=dest, %o1=vaddr */
-	mov		8, %g1
-	mov		16, %g2
-	mov		24, %g3
+	rd		%asi, %g3
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
 	set		PAGE_SIZE, %g7
 
-1:	stxa		%g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
-	add		%o0, 32, %o0
-	stxa		%g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
-	stxa		%g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
-	subcc		%g7, 64, %g7
+1:	stxa		%g0, [%o0 + 0x00] %asi
+	stxa		%g0, [%o0 + 0x08] %asi
+	stxa		%g0, [%o0 + 0x10] %asi
+	stxa		%g0, [%o0 + 0x18] %asi
+	stxa		%g0, [%o0 + 0x20] %asi
+	stxa		%g0, [%o0 + 0x28] %asi
+	stxa		%g0, [%o0 + 0x30] %asi
+	stxa		%g0, [%o0 + 0x38] %asi
+	stxa		%g0, [%o0 + 0x40] %asi
+	stxa		%g0, [%o0 + 0x48] %asi
+	stxa		%g0, [%o0 + 0x50] %asi
+	stxa		%g0, [%o0 + 0x58] %asi
+	stxa		%g0, [%o0 + 0x60] %asi
+	stxa		%g0, [%o0 + 0x68] %asi
+	stxa		%g0, [%o0 + 0x70] %asi
+	stxa		%g0, [%o0 + 0x78] %asi
+	stxa		%g0, [%o0 + 0x80] %asi
+	stxa		%g0, [%o0 + 0x88] %asi
+	stxa		%g0, [%o0 + 0x90] %asi
+	stxa		%g0, [%o0 + 0x98] %asi
+	stxa		%g0, [%o0 + 0xa0] %asi
+	stxa		%g0, [%o0 + 0xa8] %asi
+	stxa		%g0, [%o0 + 0xb0] %asi
+	stxa		%g0, [%o0 + 0xb8] %asi
+	stxa		%g0, [%o0 + 0xc0] %asi
+	stxa		%g0, [%o0 + 0xc8] %asi
+	stxa		%g0, [%o0 + 0xd0] %asi
+	stxa		%g0, [%o0 + 0xd8] %asi
+	stxa		%g0, [%o0 + 0xe0] %asi
+	stxa		%g0, [%o0 + 0xe8] %asi
+	stxa		%g0, [%o0 + 0xf0] %asi
+	stxa		%g0, [%o0 + 0xf8] %asi
+	subcc		%g7, 256, %g7
 	bne,pt		%xcc, 1b
-	 add		%o0, 32, %o0
+	 add		%o0, 256, %o0
+	wr		%g3, 0x0, %asi
 	membar		#Sync
 	retl
 	 nop
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index 8600eb2..1d32b54 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -65,7 +65,7 @@
 	if (ret != u)
 		v->counter += a;
 	spin_unlock_irqrestore(ATOMIC_HASH(v), flags);
-	return ret != u;
+	return ret;
 }
 EXPORT_SYMBOL(__atomic_add_unless);
 
diff --git a/arch/sparc/lib/ffs.S b/arch/sparc/lib/ffs.S
new file mode 100644
index 0000000..b39389f
--- /dev/null
+++ b/arch/sparc/lib/ffs.S
@@ -0,0 +1,84 @@
+#include <linux/linkage.h>
+
+	.register	%g2,#scratch
+
+	.text
+	.align	32
+
+ENTRY(ffs)
+	brnz,pt	%o0, 1f
+	 mov	1, %o1
+	retl
+	 clr	%o0
+	nop
+	nop
+ENTRY(__ffs)
+	sllx	%o0, 32, %g1		/* 1  */
+	srlx	%o0, 32, %g2
+
+	clr	%o1			/* 2  */
+	movrz	%g1, %g2, %o0
+
+	movrz	%g1, 32, %o1		/* 3  */
+1:	clr	%o2
+
+	sllx	%o0, (64 - 16), %g1	/* 4  */
+	srlx	%o0, 16, %g2
+
+	movrz	%g1, %g2, %o0		/* 5  */
+	clr	%o3
+
+	movrz	%g1, 16, %o2		/* 6  */
+	clr	%o4
+
+	and	%o0, 0xff, %g1		/* 7  */
+	srlx	%o0, 8, %g2
+
+	movrz	%g1, %g2, %o0		/* 8  */
+	clr	%o5
+
+	movrz	%g1, 8, %o3		/* 9  */
+	add	%o2, %o1, %o2
+
+	and	%o0, 0xf, %g1		/* 10 */
+	srlx	%o0, 4, %g2
+
+	movrz	%g1, %g2, %o0		/* 11 */
+	add	%o2, %o3, %o2
+
+	movrz	%g1, 4, %o4		/* 12 */
+
+	and	%o0, 0x3, %g1		/* 13 */
+	srlx	%o0, 2, %g2
+
+	movrz	%g1, %g2, %o0		/* 14 */
+	add	%o2, %o4, %o2
+
+	movrz	%g1, 2, %o5		/* 15 */
+
+	and	%o0, 0x1, %g1		/* 16 */
+
+	add	%o2, %o5, %o2		/* 17 */
+	xor	%g1, 0x1, %g1
+
+	retl				/* 18 */
+	 add	%o2, %g1, %o0
+ENDPROC(ffs)
+ENDPROC(__ffs)
+
+	.section	.popc_6insn_patch, "ax"
+	.word		ffs
+	brz,pn	%o0, 98f
+	 neg	%o0, %g1
+	xnor	%o0, %g1, %o1
+	popc	%o1, %o0
+98:	retl
+	 nop
+	.word		__ffs
+	neg	%o0, %g1
+	xnor	%o0, %g1, %o1
+	popc	%o1, %o0
+	retl
+	 sub	%o0, 1, %o0
+	nop
+	.previous
diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S
new file mode 100644
index 0000000..95414e0
--- /dev/null
+++ b/arch/sparc/lib/hweight.S
@@ -0,0 +1,51 @@
+#include <linux/linkage.h>
+
+	.text
+	.align	32
+ENTRY(__arch_hweight8)
+	ba,pt	%xcc, __sw_hweight8
+	 nop
+	nop
+ENDPROC(__arch_hweight8)
+	.section	.popc_3insn_patch, "ax"
+	.word		__arch_hweight8
+	sllx		%o0, 64-8, %g1
+	retl
+	 popc		%g1, %o0
+	.previous
+
+ENTRY(__arch_hweight16)
+	ba,pt	%xcc, __sw_hweight16
+	 nop
+	nop
+ENDPROC(__arch_hweight16)
+	.section	.popc_3insn_patch, "ax"
+	.word		__arch_hweight16
+	sllx		%o0, 64-16, %g1
+	retl
+	 popc		%g1, %o0
+	.previous
+
+ENTRY(__arch_hweight32)
+	ba,pt	%xcc, __sw_hweight32
+	 nop
+	nop
+ENDPROC(__arch_hweight32)
+	.section	.popc_3insn_patch, "ax"
+	.word		__arch_hweight32
+	sllx		%o0, 64-32, %g1
+	retl
+	 popc		%g1, %o0
+	.previous
+
+ENTRY(__arch_hweight64)
+	ba,pt	%xcc, __sw_hweight64
+	 nop
+	nop
+ENDPROC(__arch_hweight64)
+	.section	.popc_3insn_patch, "ax"
+	.word		__arch_hweight64
+	retl
+	 popc		%o0, %o0
+	nop
+	.previous
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 3fd8e18..adfac23 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1597,6 +1597,42 @@
 static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR];
 extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
 
+static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa)
+{
+	pa >>= KTSB_PHYS_SHIFT;
+
+	while (start < end) {
+		unsigned int *ia = (unsigned int *)(unsigned long)*start;
+
+		ia[0] = (ia[0] & ~0x3fffff) | (pa >> 10);
+		__asm__ __volatile__("flush	%0" : : "r" (ia));
+
+		ia[1] = (ia[1] & ~0x3ff) | (pa & 0x3ff);
+		__asm__ __volatile__("flush	%0" : : "r" (ia + 1));
+
+		start++;
+	}
+}
+
+static void ktsb_phys_patch(void)
+{
+	extern unsigned int __swapper_tsb_phys_patch;
+	extern unsigned int __swapper_tsb_phys_patch_end;
+	extern unsigned int __swapper_4m_tsb_phys_patch;
+	extern unsigned int __swapper_4m_tsb_phys_patch_end;
+	unsigned long ktsb_pa;
+
+	ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE);
+	patch_one_ktsb_phys(&__swapper_tsb_phys_patch,
+			    &__swapper_tsb_phys_patch_end, ktsb_pa);
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	ktsb_pa = (kern_base +
+		   ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));
+	patch_one_ktsb_phys(&__swapper_4m_tsb_phys_patch,
+			    &__swapper_4m_tsb_phys_patch_end, ktsb_pa);
+#endif
+}
+
 static void __init sun4v_ktsb_init(void)
 {
 	unsigned long ktsb_pa;
@@ -1716,8 +1752,10 @@
 		sun4u_pgprot_init();
 
 	if (tlb_type == cheetah_plus ||
-	    tlb_type == hypervisor)
+	    tlb_type == hypervisor) {
 		tsb_phys_patch();
+		ktsb_phys_patch();
+	}
 
 	if (tlb_type == hypervisor) {
 		sun4v_patch_tlb_handlers();