s390/mm: fix local TLB flushing vs. detach of an mm address space

The local TLB flushing code keeps an additional mask in the mm.context,
the cpu_attach_mask. At the time a global flush of an address space is
done the cpu_attach_mask is copied to the mm_cpumask in order to avoid
future global flushes in case the mm is used by a single CPU only after
the flush.

Trouble is that the reset of the mm_cpumask is racy against the detach
of an mm address space by switch_mm. The current order is first the
global TLB flush and then the copy of the cpu_attach_mask to the
mm_cpumask. The order needs to be the other way around.

Cc: <stable@vger.kernel.org>
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 72e9ca8..8823e35 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -103,7 +103,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	if (prev == next)
 		return;
 	cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
-	cpumask_set_cpu(cpu, mm_cpumask(next));
 	/* Clear old ASCE by loading the kernel ASCE. */
 	__ctl_load(S390_lowcore.kernel_asce, 1, 1);
 	__ctl_load(S390_lowcore.kernel_asce, 7, 7);
@@ -121,7 +120,7 @@ static inline void finish_arch_post_lock_switch(void)
 		preempt_disable();
 		while (atomic_read(&mm->context.flush_count))
 			cpu_relax();
-
+		cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
 		if (mm->context.flush_mm)
 			__tlb_flush_mm(mm);
 		preempt_enable();
@@ -136,6 +135,7 @@ static inline void activate_mm(struct mm_struct *prev,
                                struct mm_struct *next)
 {
 	switch_mm(prev, next, current);
+	cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
 	set_user_asce(next);
 }