[SPARC64]: More fully work around Spitfire Errata 51.

It appears that a memory barrier soon after a mispredicted
branch, not just in the delay slot, can cause the hang
condition of this cpu errata.

So move them out-of-line, and explicitly put them into
a "branch always, predict taken" delay slot which should
fully kill this problem.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/kernel/pci_iommu.c b/arch/sparc64/kernel/pci_iommu.c
index 2803bc7..425c60c 100644
--- a/arch/sparc64/kernel/pci_iommu.c
+++ b/arch/sparc64/kernel/pci_iommu.c
@@ -466,7 +466,7 @@
 		if (!limit)
 			break;
 		udelay(1);
-		membar("#LoadLoad");
+		rmb();
 	}
 	if (!limit)
 		printk(KERN_WARNING "pci_strbuf_flush: flushflag timeout "
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 07424b0..6625543 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -103,7 +103,7 @@
 		 * other cpus see our increasing idleness for the buddy
 		 * redistribution algorithm.  -DaveM
 		 */
-		membar("#StoreStore | #StoreLoad");
+		membar_storeload_storestore();
 	}
 }
 
diff --git a/arch/sparc64/kernel/sbus.c b/arch/sparc64/kernel/sbus.c
index 89f5e01..e09ddf9 100644
--- a/arch/sparc64/kernel/sbus.c
+++ b/arch/sparc64/kernel/sbus.c
@@ -147,7 +147,7 @@
 		if (!limit)
 			break;
 		udelay(1);
-		membar("#LoadLoad");
+		rmb();
 	}
 	if (!limit)
 		printk(KERN_WARNING "sbus_strbuf_flush: flushflag timeout "
diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c
index b1ed230..aecccd0 100644
--- a/arch/sparc64/kernel/signal32.c
+++ b/arch/sparc64/kernel/signal32.c
@@ -877,11 +877,12 @@
 			unsigned long page = (unsigned long)
 				page_address(pte_page(*ptep));
 
-			__asm__ __volatile__(
-			"	membar	#StoreStore\n"
-			"	flush	%0 + %1"
-			: : "r" (page), "r" (address & (PAGE_SIZE - 1))
-			: "memory");
+			wmb();
+			__asm__ __volatile__("flush	%0 + %1"
+					     : /* no outputs */
+					     : "r" (page),
+					       "r" (address & (PAGE_SIZE - 1))
+					     : "memory");
 		}
 		pte_unmap(ptep);
 		preempt_enable();
@@ -1292,11 +1293,12 @@
 			unsigned long page = (unsigned long)
 				page_address(pte_page(*ptep));
 
-			__asm__ __volatile__(
-			"	membar	#StoreStore\n"
-			"	flush	%0 + %1"
-			: : "r" (page), "r" (address & (PAGE_SIZE - 1))
-			: "memory");
+			wmb();
+			__asm__ __volatile__("flush	%0 + %1"
+					     : /* no outputs */
+					     : "r" (page),
+					       "r" (address & (PAGE_SIZE - 1))
+					     : "memory");
 		}
 		pte_unmap(ptep);
 		preempt_enable();
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index b9b4249..b4fc6a5 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -144,7 +144,7 @@
 	current->active_mm = &init_mm;
 
 	while (!cpu_isset(cpuid, smp_commenced_mask))
-		membar("#LoadLoad");
+		rmb();
 
 	cpu_set(cpuid, cpu_online_map);
 }
@@ -184,11 +184,11 @@
 	for (i = 0; i < NUM_ITERS; i++) {
 		t0 = tick_ops->get_tick();
 		go[MASTER] = 1;
-		membar("#StoreLoad");
+		membar_storeload();
 		while (!(tm = go[SLAVE]))
-			membar("#LoadLoad");
+			rmb();
 		go[SLAVE] = 0;
-		membar("#StoreStore");
+		wmb();
 		t1 = tick_ops->get_tick();
 
 		if (t1 - t0 < best_t1 - best_t0)
@@ -221,7 +221,7 @@
 	go[MASTER] = 1;
 
 	while (go[MASTER])
-		membar("#LoadLoad");
+		rmb();
 
 	local_irq_save(flags);
 	{
@@ -273,21 +273,21 @@
 
 	/* wait for client to be ready */
 	while (!go[MASTER])
-		membar("#LoadLoad");
+		rmb();
 
 	/* now let the client proceed into his loop */
 	go[MASTER] = 0;
-	membar("#StoreLoad");
+	membar_storeload();
 
 	spin_lock_irqsave(&itc_sync_lock, flags);
 	{
 		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; i++) {
 			while (!go[MASTER])
-				membar("#LoadLoad");
+				rmb();
 			go[MASTER] = 0;
-			membar("#StoreStore");
+			wmb();
 			go[SLAVE] = tick_ops->get_tick();
-			membar("#StoreLoad");
+			membar_storeload();
 		}
 	}
 	spin_unlock_irqrestore(&itc_sync_lock, flags);
@@ -927,11 +927,11 @@
 		       smp_processor_id());
 #endif
 		penguins_are_doing_time = 1;
-		membar("#StoreStore | #LoadStore");
+		membar_storestore_loadstore();
 		atomic_inc(&smp_capture_registry);
 		smp_cross_call(&xcall_capture, 0, 0, 0);
 		while (atomic_read(&smp_capture_registry) != ncpus)
-			membar("#LoadLoad");
+			rmb();
 #ifdef CAPTURE_DEBUG
 		printk("done\n");
 #endif
@@ -947,7 +947,7 @@
 		       smp_processor_id());
 #endif
 		penguins_are_doing_time = 0;
-		membar("#StoreStore | #StoreLoad");
+		membar_storeload_storestore();
 		atomic_dec(&smp_capture_registry);
 	}
 }
@@ -970,9 +970,9 @@
 	save_alternate_globals(global_save);
 	prom_world(1);
 	atomic_inc(&smp_capture_registry);
-	membar("#StoreLoad | #StoreStore");
+	membar_storeload_storestore();
 	while (penguins_are_doing_time)
-		membar("#LoadLoad");
+		rmb();
 	restore_alternate_globals(global_save);
 	atomic_dec(&smp_capture_registry);
 	prom_world(0);
diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c
index 0764b93..a3ea697 100644
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -406,3 +406,12 @@
 EXPORT_SYMBOL(xor_vis_5);
 
 EXPORT_SYMBOL(prom_palette);
+
+/* memory barriers */
+EXPORT_SYMBOL(mb);
+EXPORT_SYMBOL(rmb);
+EXPORT_SYMBOL(wmb);
+EXPORT_SYMBOL(membar_storeload);
+EXPORT_SYMBOL(membar_storeload_storestore);
+EXPORT_SYMBOL(membar_storeload_loadload);
+EXPORT_SYMBOL(membar_storestore_loadstore);
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
index 40dbeec..6201f10 100644
--- a/arch/sparc64/lib/Makefile
+++ b/arch/sparc64/lib/Makefile
@@ -12,7 +12,7 @@
 	 U1memcpy.o U1copy_from_user.o U1copy_to_user.o \
 	 U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \
 	 copy_in_user.o user_fixup.o memmove.o \
-	 mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o
+	 mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o mb.o
 
 lib-$(CONFIG_DEBUG_SPINLOCK) += debuglocks.o
 lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c
index 7f6ccc4..f5f0b55 100644
--- a/arch/sparc64/lib/debuglocks.c
+++ b/arch/sparc64/lib/debuglocks.c
@@ -61,7 +61,7 @@
 			     : "=r" (val)
 			     : "r" (&(lock->lock))
 			     : "memory");
-	membar("#StoreLoad | #StoreStore");
+	membar_storeload_storestore();
 	if (val) {
 		while (lock->lock) {
 			if (!--stuck) {
@@ -69,7 +69,7 @@
 					show(str, lock, caller);
 				stuck = INIT_STUCK;
 			}
-			membar("#LoadLoad");
+			rmb();
 		}
 		goto again;
 	}
@@ -90,7 +90,7 @@
 			     : "=r" (val)
 			     : "r" (&(lock->lock))
 			     : "memory");
-	membar("#StoreLoad | #StoreStore");
+	membar_storeload_storestore();
 	if (!val) {
 		lock->owner_pc = ((unsigned int)caller);
 		lock->owner_cpu = cpu;
@@ -107,7 +107,7 @@
 {
 	lock->owner_pc = 0;
 	lock->owner_cpu = NO_PROC_ID;
-	membar("#StoreStore | #LoadStore");
+	membar_storestore_loadstore();
 	lock->lock = 0;
 	current->thread.smp_lock_count--;
 }
@@ -129,7 +129,7 @@
 				show_read(str, rw, caller);
 			stuck = INIT_STUCK;
 		}
-		membar("#LoadLoad");
+		rmb();
 	}
 	/* Try once to increment the counter.  */
 	__asm__ __volatile__(
@@ -142,7 +142,7 @@
 "2:"	: "=r" (val)
 	: "0" (&(rw->lock))
 	: "g1", "g7", "memory");
-	membar("#StoreLoad | #StoreStore");
+	membar_storeload_storestore();
 	if (val)
 		goto wlock_again;
 	rw->reader_pc[cpu] = ((unsigned int)caller);
@@ -201,7 +201,7 @@
 				show_write(str, rw, caller);
 			stuck = INIT_STUCK;
 		}
-		membar("#LoadLoad");
+		rmb();
 	}
 
 	/* Try to acuire the write bit.  */
@@ -256,7 +256,7 @@
 					show_write(str, rw, caller);
 				stuck = INIT_STUCK;
 			}
-			membar("#LoadLoad");
+			rmb();
 		}
 		goto wlock_again;
 	}
diff --git a/arch/sparc64/lib/mb.S b/arch/sparc64/lib/mb.S
new file mode 100644
index 0000000..4004f74
--- /dev/null
+++ b/arch/sparc64/lib/mb.S
@@ -0,0 +1,73 @@
+/* mb.S: Out of line memory barriers.
+ *
+ * Copyright (C) 2005 David S. Miller (davem@davemloft.net)
+ */
+
+	/* These are here in an effort to more fully work around
+	 * Spitfire Errata #51.  Essentially, if a memory barrier
+	 * occurs soon after a mispredicted branch, the chip can stop
+	 * executing instructions until a trap occurs.  Therefore, if
+	 * interrupts are disabled, the chip can hang forever.
+	 *
+	 * It used to be believed that the memory barrier had to be
+	 * right in the delay slot, but a case has been traced
+	 * recently wherein the memory barrier was one instruction
+	 * after the branch delay slot and the chip still hung.  The
+	 * offending sequence was the following in sym_wakeup_done()
+	 * of the sym53c8xx_2 driver:
+	 *
+	 *	call	sym_ccb_from_dsa, 0
+	 *	 movge	%icc, 0, %l0
+	 *	brz,pn	%o0, .LL1303
+	 *	 mov	%o0, %l2
+	 *	membar	#LoadLoad
+	 *
+	 * The branch has to be mispredicted for the bug to occur.
+	 * Therefore, we put the memory barrier explicitly into a
+	 * "branch always, predicted taken" delay slot to avoid the
+	 * problem case.
+	 */
+
+	.text
+
+99:	retl
+	 nop
+
+	.globl	mb
+mb:	ba,pt	%xcc, 99b
+	 membar	#LoadLoad | #LoadStore | #StoreStore | #StoreLoad
+	.size	mb, .-mb
+
+	.globl	rmb
+rmb:	ba,pt	%xcc, 99b
+	 membar	#LoadLoad
+	.size	rmb, .-rmb
+
+	.globl	wmb
+wmb:	ba,pt	%xcc, 99b
+	 membar	#StoreStore
+	.size	wmb, .-wmb
+
+	.globl	membar_storeload
+membar_storeload:
+	ba,pt	%xcc, 99b
+	 membar	#StoreLoad
+	.size	membar_storeload, .-membar_storeload
+
+	.globl	membar_storeload_storestore
+membar_storeload_storestore:
+	ba,pt	%xcc, 99b
+	 membar	#StoreLoad | #StoreStore
+	.size	membar_storeload_storestore, .-membar_storeload_storestore
+
+	.globl	membar_storeload_loadload
+membar_storeload_loadload:
+	ba,pt	%xcc, 99b
+	 membar	#StoreLoad | #LoadLoad
+	.size	membar_storeload_loadload, .-membar_storeload_loadload
+
+	.globl	membar_storestore_loadstore
+membar_storestore_loadstore:
+	ba,pt	%xcc, 99b
+	 membar	#StoreStore | #LoadStore
+	.size	membar_storestore_loadstore, .-membar_storestore_loadstore
diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c
index 15b4cfe..302efbc 100644
--- a/arch/sparc64/solaris/misc.c
+++ b/arch/sparc64/solaris/misc.c
@@ -737,7 +737,8 @@
 extern u32 tl0_solaris[8];
 #define update_ttable(x) 										\
 	tl0_solaris[3] = (((long)(x) - (long)tl0_solaris - 3) >> 2) | 0x40000000;			\
-	__asm__ __volatile__ ("membar #StoreStore; flush %0" : : "r" (&tl0_solaris[3]))
+	wmb();		\
+	__asm__ __volatile__ ("flush %0" : : "r" (&tl0_solaris[3]))
 #else
 #endif	
 
@@ -761,7 +762,8 @@
 	entry64_personality_patch |=
 		(offsetof(struct task_struct, personality) +
 		 (sizeof(unsigned long) - 1));
-	__asm__ __volatile__("membar #StoreStore; flush %0"
+	wmb();
+	__asm__ __volatile__("flush %0"
 			     : : "r" (&entry64_personality_patch));
 	return 0;
 }
diff --git a/include/asm-sparc64/atomic.h b/include/asm-sparc64/atomic.h
index d80f3379..e175afc 100644
--- a/include/asm-sparc64/atomic.h
+++ b/include/asm-sparc64/atomic.h
@@ -72,10 +72,10 @@
 
 /* Atomic operations are already serializing */
 #ifdef CONFIG_SMP
-#define smp_mb__before_atomic_dec()	membar("#StoreLoad | #LoadLoad")
-#define smp_mb__after_atomic_dec()	membar("#StoreLoad | #StoreStore")
-#define smp_mb__before_atomic_inc()	membar("#StoreLoad | #LoadLoad")
-#define smp_mb__after_atomic_inc()	membar("#StoreLoad | #StoreStore")
+#define smp_mb__before_atomic_dec()	membar_storeload_loadload();
+#define smp_mb__after_atomic_dec()	membar_storeload_storestore();
+#define smp_mb__before_atomic_inc()	membar_storeload_loadload();
+#define smp_mb__after_atomic_inc()	membar_storeload_storestore();
 #else
 #define smp_mb__before_atomic_dec()	barrier()
 #define smp_mb__after_atomic_dec()	barrier()
diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h
index 9c5e719..6388b83 100644
--- a/include/asm-sparc64/bitops.h
+++ b/include/asm-sparc64/bitops.h
@@ -72,8 +72,8 @@
 }
 
 #ifdef CONFIG_SMP
-#define smp_mb__before_clear_bit()	membar("#StoreLoad | #LoadLoad")
-#define smp_mb__after_clear_bit()	membar("#StoreLoad | #StoreStore")
+#define smp_mb__before_clear_bit()	membar_storeload_loadload()
+#define smp_mb__after_clear_bit()	membar_storeload_storestore()
 #else
 #define smp_mb__before_clear_bit()	barrier()
 #define smp_mb__after_clear_bit()	barrier()
diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h
index d265bf6..a02c437 100644
--- a/include/asm-sparc64/spinlock.h
+++ b/include/asm-sparc64/spinlock.h
@@ -43,7 +43,7 @@
 #define spin_is_locked(lp)  ((lp)->lock != 0)
 
 #define spin_unlock_wait(lp)	\
-do {	membar("#LoadLoad");	\
+do {	rmb();			\
 } while((lp)->lock)
 
 static inline void _raw_spin_lock(spinlock_t *lock)
@@ -129,7 +129,7 @@
 #define spin_is_locked(__lock)	((__lock)->lock != 0)
 #define spin_unlock_wait(__lock)	\
 do { \
-	membar("#LoadLoad"); \
+	rmb(); \
 } while((__lock)->lock)
 
 extern void _do_spin_lock(spinlock_t *lock, char *str, unsigned long caller);
diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h
index ee4bdfc..5e94c05 100644
--- a/include/asm-sparc64/system.h
+++ b/include/asm-sparc64/system.h
@@ -28,6 +28,14 @@
 #define ARCH_SUN4C_SUN4 0
 #define ARCH_SUN4 0
 
+extern void mb(void);
+extern void rmb(void);
+extern void wmb(void);
+extern void membar_storeload(void);
+extern void membar_storeload_storestore(void);
+extern void membar_storeload_loadload(void);
+extern void membar_storestore_loadstore(void);
+
 #endif
 
 #define setipl(__new_ipl) \
@@ -78,16 +86,11 @@
 
 #define nop() 		__asm__ __volatile__ ("nop")
 
-#define membar(type)	__asm__ __volatile__ ("membar " type : : : "memory")
-#define mb()		\
-	membar("#LoadLoad | #LoadStore | #StoreStore | #StoreLoad")
-#define rmb()		membar("#LoadLoad")
-#define wmb()		membar("#StoreStore")
 #define read_barrier_depends()		do { } while(0)
 #define set_mb(__var, __value) \
-	do { __var = __value; membar("#StoreLoad | #StoreStore"); } while(0)
+	do { __var = __value; membar_storeload_storestore(); } while(0)
 #define set_wmb(__var, __value) \
-	do { __var = __value; membar("#StoreStore"); } while(0)
+	do { __var = __value; wmb(); } while(0)
 
 #ifdef CONFIG_SMP
 #define smp_mb()	mb()