[PATCH] x86: kernel irq balance doesn't work
On i386, kernel irq balance doesn't work.
1) In function do_irq_balance, after kernel finds the min_loaded cpu but
before calling set_pending_irq to really pin the selected_irq to the
target cpu, kernel does a cpus_and with irq_affinity[selected_irq].
Later on, when the irq is acked, kernel would calls
move_native_irq=>desc->handler->set_affinity to change the irq affinity.
However, every function pointed by
hw_interrupt_type->set_affinity(unsigned int irq, cpumask_t cpumask)
always changes irq_affinity[irq] to cpumask. Next time when recalling
do_irq_balance, it has to do cpu_ands again with
irq_affinity[selected_irq], but irq_affinity[selected_irq] already
becomes one cpu selected by the first irq balance.
2) Function balance_irq in file arch/i386/kernel/io_apic.c has the same
issue.
[akpm@osdl.org: cleanups]
Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index d70f2ad..a62df3e 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -267,7 +267,7 @@
# include <linux/slab.h> /* kmalloc() */
# include <linux/timer.h> /* time_after() */
-# ifdef CONFIG_BALANCED_IRQ_DEBUG
+#ifdef CONFIG_BALANCED_IRQ_DEBUG
# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
# define Dprintk(x...) do { TDprintk(x); } while (0)
# else
@@ -275,10 +275,15 @@
# define Dprintk(x...)
# endif
-
#define IRQBALANCE_CHECK_ARCH -999
-static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
-static int physical_balance = 0;
+#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
+#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
+#define BALANCED_IRQ_MORE_DELTA (HZ/10)
+#define BALANCED_IRQ_LESS_DELTA (HZ)
+
+static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
+static int physical_balance __read_mostly;
+static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
static struct irq_cpu_info {
unsigned long * last_irq;
@@ -297,12 +302,14 @@
#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
-#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
-#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
-#define BALANCED_IRQ_MORE_DELTA (HZ/10)
-#define BALANCED_IRQ_LESS_DELTA (HZ)
+static cpumask_t balance_irq_affinity[NR_IRQS] = {
+ [0 ... NR_IRQS-1] = CPU_MASK_ALL
+};
-static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
+void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+ balance_irq_affinity[irq] = mask;
+}
static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
unsigned long now, int direction)
@@ -340,7 +347,7 @@
if (irqbalance_disabled)
return;
- cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
+ cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
new_cpu = move(cpu, allowed_mask, now, 1);
if (cpu != new_cpu) {
set_pending_irq(irq, cpumask_of_cpu(new_cpu));
@@ -529,7 +536,9 @@
}
}
- cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
+ cpus_and(allowed_mask,
+ cpu_online_map,
+ balance_irq_affinity[selected_irq]);
target_cpu_mask = cpumask_of_cpu(min_loaded);
cpus_and(tmp, target_cpu_mask, allowed_mask);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 42c9cd5..e8a07e7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -164,6 +164,14 @@
#endif // CONFIG_SMP
+#ifdef CONFIG_IRQBALANCE
+extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask);
+#else
+static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+}
+#endif
+
extern int no_irq_affinity;
extern int noirqdebug_setup(char *str);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d03b5ee..afacd6f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -24,6 +24,8 @@
#ifdef CONFIG_GENERIC_PENDING_IRQ
void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
{
+ set_balance_irq_affinity(irq, mask_val);
+
/*
* Save these away for later use. Re-progam when the
* interrupt is pending
@@ -33,6 +35,7 @@
#else
void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
{
+ set_balance_irq_affinity(irq, mask_val);
irq_affinity[irq] = mask_val;
irq_desc[irq].handler->set_affinity(irq, mask_val);
}