s390/bitops: implement cache friendly test_and_set_bit_lock

The generic implementation for test_and_set_bit_lock in include/asm-generic
uses the standard test_and_set_bit operation. This is done with either a
'csg' or a 'loag' instruction. For both version the cache line is fetched
exclusively, even if the bit is already set. The result is an increase in
cache traffic, for a contented lock this is a bad idea.

Acked-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 9b68e98..66a1cff 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -276,6 +276,28 @@
 	return (*addr >> (nr & 7)) & 1;
 }
 
+static inline int test_and_set_bit_lock(unsigned long nr,
+					volatile unsigned long *ptr)
+{
+	if (test_bit(nr, ptr))
+		return 1;
+	return test_and_set_bit(nr, ptr);
+}
+
+static inline void clear_bit_unlock(unsigned long nr,
+				    volatile unsigned long *ptr)
+{
+	smp_mb__before_atomic();
+	clear_bit(nr, ptr);
+}
+
+static inline void __clear_bit_unlock(unsigned long nr,
+				      volatile unsigned long *ptr)
+{
+	smp_mb();
+	__clear_bit(nr, ptr);
+}
+
 /*
  * Functions which use MSB0 bit numbering.
  * On an s390x system the bits are numbered:
@@ -446,7 +468,6 @@
 #include <asm-generic/bitops/ffz.h>
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/hweight.h>
-#include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic-setbit.h>