[PATCH] x86: kernel irq balance doesn't work

On i386, kernel irq balance doesn't work.

1) In function do_irq_balance, after kernel finds the min_loaded cpu but
   before calling set_pending_irq to really pin the selected_irq to the
   target cpu, kernel does a cpus_and with irq_affinity[selected_irq].
   Later on, when the irq is acked, kernel would calls
   move_native_irq=>desc->handler->set_affinity to change the irq affinity.
    However, every function pointed by
   hw_interrupt_type->set_affinity(unsigned int irq, cpumask_t cpumask)
   always changes irq_affinity[irq] to cpumask.  Next time when recalling
   do_irq_balance, it has to do cpu_ands again with
   irq_affinity[selected_irq], but irq_affinity[selected_irq] already
   becomes one cpu selected by the first irq balance.

2) Function balance_irq in file arch/i386/kernel/io_apic.c has the same
   issue.

[akpm@osdl.org: cleanups]
Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index d70f2ad..a62df3e 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -267,7 +267,7 @@
 # include <linux/slab.h>		/* kmalloc() */
 # include <linux/timer.h>	/* time_after() */
  
-# ifdef CONFIG_BALANCED_IRQ_DEBUG
+#ifdef CONFIG_BALANCED_IRQ_DEBUG
 #  define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
 #  define Dprintk(x...) do { TDprintk(x); } while (0)
 # else
@@ -275,10 +275,15 @@
 #  define Dprintk(x...) 
 # endif
 
-
 #define IRQBALANCE_CHECK_ARCH -999
-static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
-static int physical_balance = 0;
+#define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
+#define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
+#define BALANCED_IRQ_MORE_DELTA		(HZ/10)
+#define BALANCED_IRQ_LESS_DELTA		(HZ)
+
+static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
+static int physical_balance __read_mostly;
+static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
 
 static struct irq_cpu_info {
 	unsigned long * last_irq;
@@ -297,12 +302,14 @@
 
 #define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
 
-#define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
-#define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
-#define BALANCED_IRQ_MORE_DELTA		(HZ/10)
-#define BALANCED_IRQ_LESS_DELTA		(HZ)
+static cpumask_t balance_irq_affinity[NR_IRQS] = {
+	[0 ... NR_IRQS-1] = CPU_MASK_ALL
+};
 
-static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
+void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	balance_irq_affinity[irq] = mask;
+}
 
 static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
 			unsigned long now, int direction)
@@ -340,7 +347,7 @@
 	if (irqbalance_disabled)
 		return; 
 
-	cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
+	cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
 	new_cpu = move(cpu, allowed_mask, now, 1);
 	if (cpu != new_cpu) {
 		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
@@ -529,7 +536,9 @@
 		}
 	}
 
-	cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
+	cpus_and(allowed_mask,
+		cpu_online_map,
+		balance_irq_affinity[selected_irq]);
 	target_cpu_mask = cpumask_of_cpu(min_loaded);
 	cpus_and(tmp, target_cpu_mask, allowed_mask);
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 42c9cd5..e8a07e7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -164,6 +164,14 @@
 
 #endif // CONFIG_SMP
 
+#ifdef CONFIG_IRQBALANCE
+extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask);
+#else
+static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+}
+#endif
+
 extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d03b5ee..afacd6f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -24,6 +24,8 @@
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
+	set_balance_irq_affinity(irq, mask_val);
+
 	/*
 	 * Save these away for later use. Re-progam when the
 	 * interrupt is pending
@@ -33,6 +35,7 @@
 #else
 void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
+	set_balance_irq_affinity(irq, mask_val);
 	irq_affinity[irq] = mask_val;
 	irq_desc[irq].handler->set_affinity(irq, mask_val);
 }