[SPARC64]: Consolidate MSI support code.

This also makes us use the MSI queues correctly.

Each MSI queue is serviced by a normal sun4u/sun4v INO interrupt
handler.  This handler runs the MSI queue and dispatches the
virtual interrupts indicated by arriving MSIs in that MSI queue.

All of the common logic is placed in pci_msi.c, with callbacks to
handle the PCI controller specific aspects of the operations.

This common infrastructure will make it much easier to add MSG
support.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile
index 40d2f3a..112c46e 100644
--- a/arch/sparc64/kernel/Makefile
+++ b/arch/sparc64/kernel/Makefile
@@ -18,6 +18,7 @@
 obj-$(CONFIG_PCI)	 += ebus.o isa.o pci_common.o \
 			    pci_psycho.o pci_sabre.o pci_schizo.o \
 			    pci_sun4v.o pci_sun4v_asm.o pci_fire.o
+obj-$(CONFIG_PCI_MSI)	+= pci_msi.o
 obj-$(CONFIG_SMP)	 += smp.o trampoline.o hvtramp.o
 obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o
 obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o
diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c
index 7f5a4c7..045ab27 100644
--- a/arch/sparc64/kernel/irq.c
+++ b/arch/sparc64/kernel/irq.c
@@ -21,7 +21,6 @@
 #include <linux/seq_file.h>
 #include <linux/bootmem.h>
 #include <linux/irq.h>
-#include <linux/msi.h>
 
 #include <asm/ptrace.h>
 #include <asm/processor.h>
@@ -92,39 +91,46 @@
 	unsigned int dev_handle;
 	unsigned int dev_ino;
 } virt_to_real_irq_table[NR_IRQS];
+static DEFINE_SPINLOCK(virt_irq_alloc_lock);
 
-static unsigned char virt_irq_alloc(unsigned int real_irq)
+unsigned char virt_irq_alloc(unsigned int real_irq)
 {
+	unsigned long flags;
 	unsigned char ent;
 
 	BUILD_BUG_ON(NR_IRQS >= 256);
 
+	spin_lock_irqsave(&virt_irq_alloc_lock, flags);
+	/* Slot 0 is never handed out, so 0 can double as "no IRQ". */
 	for (ent = 1; ent < NR_IRQS; ent++) {
 		if (!virt_to_real_irq_table[ent].irq)
 			break;
 	}
 	if (ent >= NR_IRQS) {
 		printk(KERN_ERR "IRQ: Out of virtual IRQs.\n");
-		return 0;
+		ent = 0;	/* report failure, but still drop the lock */
+	} else {
+		virt_to_real_irq_table[ent].irq = real_irq;
 	}
 
-	virt_to_real_irq_table[ent].irq = real_irq;
+	spin_unlock_irqrestore(&virt_irq_alloc_lock, flags);
 
 	return ent;
 }
 
 #ifdef CONFIG_PCI_MSI
-static void virt_irq_free(unsigned int virt_irq)
+void virt_irq_free(unsigned int virt_irq)
 {
-	unsigned int real_irq;
+	unsigned long flags;
 
 	if (virt_irq >= NR_IRQS)
 		return;
 
-	real_irq = virt_to_real_irq_table[virt_irq].irq;
+	spin_lock_irqsave(&virt_irq_alloc_lock, flags);
+	/* A zero .irq marks the slot as free for virt_irq_alloc(). */
 	virt_to_real_irq_table[virt_irq].irq = 0;
 
-	__bucket(real_irq)->virt_irq = 0;
+	spin_unlock_irqrestore(&virt_irq_alloc_lock, flags);
 }
 #endif
 
@@ -217,27 +223,8 @@
 	void		(*pre_handler)(unsigned int, void *, void *);
 	void		*pre_handler_arg1;
 	void		*pre_handler_arg2;
-
-	u32		msi;
 };
 
-void sparc64_set_msi(unsigned int virt_irq, u32 msi)
-{
-	struct irq_handler_data *data = get_irq_chip_data(virt_irq);
-
-	if (data)
-		data->msi = msi;
-}
-
-u32 sparc64_get_msi(unsigned int virt_irq)
-{
-	struct irq_handler_data *data = get_irq_chip_data(virt_irq);
-
-	if (data)
-		return data->msi;
-	return 0xffffffff;
-}
-
 static inline struct ino_bucket *virt_irq_to_bucket(unsigned int virt_irq)
 {
 	unsigned int real_irq = virt_to_real_irq(virt_irq);
@@ -405,32 +392,6 @@
 	}
 }
 
-#ifdef CONFIG_PCI_MSI
-static void sun4u_msi_enable(unsigned int virt_irq)
-{
-	sun4u_irq_enable(virt_irq);
-	unmask_msi_irq(virt_irq);
-}
-
-static void sun4u_msi_disable(unsigned int virt_irq)
-{
-	mask_msi_irq(virt_irq);
-	sun4u_irq_disable(virt_irq);
-}
-
-static void sun4v_msi_enable(unsigned int virt_irq)
-{
-	sun4v_irq_enable(virt_irq);
-	unmask_msi_irq(virt_irq);
-}
-
-static void sun4v_msi_disable(unsigned int virt_irq)
-{
-	mask_msi_irq(virt_irq);
-	sun4v_irq_disable(virt_irq);
-}
-#endif
-
 static void sun4v_irq_end(unsigned int virt_irq)
 {
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
@@ -585,39 +546,6 @@
 	.set_affinity	= sun4v_set_affinity,
 };
 
-static struct irq_chip sun4v_irq_ack = {
-	.typename	= "sun4v+ack",
-	.enable		= sun4v_irq_enable,
-	.disable	= sun4v_irq_disable,
-	.ack		= run_pre_handler,
-	.end		= sun4v_irq_end,
-	.set_affinity	= sun4v_set_affinity,
-};
-
-#ifdef CONFIG_PCI_MSI
-static struct irq_chip sun4u_msi = {
-	.typename	= "sun4u+msi",
-	.mask		= mask_msi_irq,
-	.unmask		= unmask_msi_irq,
-	.enable		= sun4u_msi_enable,
-	.disable	= sun4u_msi_disable,
-	.ack		= run_pre_handler,
-	.end		= sun4u_irq_end,
-	.set_affinity	= sun4u_set_affinity,
-};
-
-static struct irq_chip sun4v_msi = {
-	.typename	= "sun4v+msi",
-	.mask		= mask_msi_irq,
-	.unmask		= unmask_msi_irq,
-	.enable		= sun4v_msi_enable,
-	.disable	= sun4v_msi_disable,
-	.ack		= run_pre_handler,
-	.end		= sun4v_irq_end,
-	.set_affinity	= sun4v_set_affinity,
-};
-#endif
-
 static struct irq_chip sun4v_virq = {
 	.typename	= "vsun4v",
 	.enable		= sun4v_virq_enable,
@@ -626,42 +554,27 @@
 	.set_affinity	= sun4v_virt_set_affinity,
 };
 
-static struct irq_chip sun4v_virq_ack = {
-	.typename	= "vsun4v+ack",
-	.enable		= sun4v_virq_enable,
-	.disable	= sun4v_virq_disable,
-	.ack		= run_pre_handler,
-	.end		= sun4v_virq_end,
-	.set_affinity	= sun4v_virt_set_affinity,
-};
-
 void irq_install_pre_handler(int virt_irq,
 			     void (*func)(unsigned int, void *, void *),
 			     void *arg1, void *arg2)
 {
 	struct irq_handler_data *data = get_irq_chip_data(virt_irq);
-	struct irq_chip *chip;
+	struct irq_chip *chip = get_irq_chip(virt_irq);
+	/* There is no "+ack" chip for sun4v IRQs to switch to; refuse. */
+	if (WARN_ON(chip == &sun4v_irq || chip == &sun4v_virq)) {
+		printk(KERN_ERR "IRQ: Trying to install pre-handler on "
+		       "sun4v irq %u\n", virt_irq);
+		return;
+	}
 
 	data->pre_handler = func;
 	data->pre_handler_arg1 = arg1;
 	data->pre_handler_arg2 = arg2;
 
-	chip = get_irq_chip(virt_irq);
-	if (chip == &sun4u_irq_ack ||
-	    chip == &sun4v_irq_ack ||
-	    chip == &sun4v_virq_ack
-#ifdef CONFIG_PCI_MSI
-	    || chip == &sun4u_msi
-	    || chip == &sun4v_msi
-#endif
-	    )
+	if (chip == &sun4u_irq_ack)	/* chip was already switched */
 		return;
 
-	chip = (chip == &sun4u_irq ?
-		&sun4u_irq_ack :
-		(chip == &sun4v_irq ?
-		 &sun4v_irq_ack : &sun4v_virq_ack));
-	set_irq_chip(virt_irq, chip);
+	set_irq_chip(virt_irq, &sun4u_irq_ack);
 }
 
 unsigned int build_irq(int inofixup, unsigned long iclr, unsigned long imap)
@@ -765,103 +678,6 @@
 	return virq;
 }
 
-#ifdef CONFIG_PCI_MSI
-unsigned int sun4v_build_msi(u32 devhandle, unsigned int *virt_irq_p,
-			     unsigned int msi_start, unsigned int msi_end)
-{
-	struct ino_bucket *bucket;
-	struct irq_handler_data *data;
-	unsigned long sysino;
-	unsigned int devino;
-
-	BUG_ON(tlb_type != hypervisor);
-
-	/* Find a free devino in the given range.  */
-	for (devino = msi_start; devino < msi_end; devino++) {
-		sysino = sun4v_devino_to_sysino(devhandle, devino);
-		bucket = &ivector_table[sysino];
-		if (!bucket->virt_irq)
-			break;
-	}
-	if (devino >= msi_end)
-		return -ENOSPC;
-
-	sysino = sun4v_devino_to_sysino(devhandle, devino);
-	bucket = &ivector_table[sysino];
-	bucket->virt_irq = virt_irq_alloc(__irq(bucket));
-	*virt_irq_p = bucket->virt_irq;
-	set_irq_chip(bucket->virt_irq, &sun4v_msi);
-
-	data = get_irq_chip_data(bucket->virt_irq);
-	if (unlikely(data))
-		return devino;
-
-	data = kzalloc(sizeof(struct irq_handler_data), GFP_ATOMIC);
-	if (unlikely(!data)) {
-		virt_irq_free(*virt_irq_p);
-		return -ENOMEM;
-	}
-	set_irq_chip_data(bucket->virt_irq, data);
-
-	data->imap = ~0UL;
-	data->iclr = ~0UL;
-
-	return devino;
-}
-
-void sun4v_destroy_msi(unsigned int virt_irq)
-{
-	virt_irq_free(virt_irq);
-}
-
-unsigned int sun4u_build_msi(u32 portid, unsigned int *virt_irq_p,
-			     unsigned int msi_start, unsigned int msi_end,
-			     unsigned long imap_base, unsigned long iclr_base)
-{
-	struct ino_bucket *bucket;
-	struct irq_handler_data *data;
-	unsigned long sysino;
-	unsigned int devino;
-
-	/* Find a free devino in the given range.  */
-	for (devino = msi_start; devino < msi_end; devino++) {
-		sysino = (portid << 6) | devino;
-		bucket = &ivector_table[sysino];
-		if (!bucket->virt_irq)
-			break;
-	}
-	if (devino >= msi_end)
-		return -ENOSPC;
-
-	sysino = (portid << 6) | devino;
-	bucket = &ivector_table[sysino];
-	bucket->virt_irq = virt_irq_alloc(__irq(bucket));
-	*virt_irq_p = bucket->virt_irq;
-	set_irq_chip(bucket->virt_irq, &sun4u_msi);
-
-	data = get_irq_chip_data(bucket->virt_irq);
-	if (unlikely(data))
-		return devino;
-
-	data = kzalloc(sizeof(struct irq_handler_data), GFP_ATOMIC);
-	if (unlikely(!data)) {
-		virt_irq_free(*virt_irq_p);
-		return -ENOMEM;
-	}
-	set_irq_chip_data(bucket->virt_irq, data);
-
-	data->imap = (imap_base + (devino * 0x8UL));
-	data->iclr = (iclr_base + (devino * 0x8UL));
-
-	return devino;
-}
-
-void sun4u_destroy_msi(unsigned int virt_irq)
-{
-	virt_irq_free(virt_irq);
-}
-#endif
-
 void ack_bad_irq(unsigned int virt_irq)
 {
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
diff --git a/arch/sparc64/kernel/pci_fire.c b/arch/sparc64/kernel/pci_fire.c
index 090f265..bcf6a5d 100644
--- a/arch/sparc64/kernel/pci_fire.c
+++ b/arch/sparc64/kernel/pci_fire.c
@@ -161,90 +161,92 @@
 #define MSI_64BIT_ADDR			0x034008UL
 #define  MSI_64BIT_ADDR_VAL		0xffffffffffff0000UL
 
-/* For now this just runs as a pre-handler for the real interrupt handler.
- * So we just walk through the queue and ACK all the entries, update the
- * head pointer, and return.
- *
- * In the longer term it would be nice to do something more integrated
- * wherein we can pass in some of this MSI info to the drivers.  This
- * would be most useful for PCIe fabric error messages, although we could
- * invoke those directly from the loop here in order to pass the info around.
- */
-static void pci_msi_prehandler(unsigned int ino, void *data1, void *data2)
+static int pci_fire_get_head(struct pci_pbm_info *pbm, unsigned long msiqid,
+			     unsigned long *head)
 {
-	unsigned long msiqid, orig_head, head, type_fmt, type;
-	struct pci_pbm_info *pbm = data1;
-	struct pci_msiq_entry *base, *ep;
-
-	msiqid = (unsigned long) data2;
-
-	head = fire_read(pbm->pbm_regs + EVENT_QUEUE_HEAD(msiqid));
-
-	orig_head = head;
-	base = (pbm->msi_queues + ((msiqid - pbm->msiq_first) * 8192));
-	ep = &base[head];
-	while ((ep->word0 & MSIQ_WORD0_FMT_TYPE) != 0) {
-		unsigned long msi_num;
-
-		type_fmt = ((ep->word0 & MSIQ_WORD0_FMT_TYPE) >>
-			    MSIQ_WORD0_FMT_TYPE_SHIFT);
-		type = (type_fmt >>3);
-		if (unlikely(type != MSIQ_TYPE_MSI32 &&
-			     type != MSIQ_TYPE_MSI64))
-			goto bad_type;
-
-		msi_num = ((ep->word0 & MSIQ_WORD0_DATA0) >>
-			   MSIQ_WORD0_DATA0_SHIFT);
-
-		fire_write(pbm->pbm_regs + MSI_CLEAR(msi_num),
-			   MSI_CLEAR_EQWR_N);
-
-		/* Clear the entry.  */
-		ep->word0 &= ~MSIQ_WORD0_FMT_TYPE;
-
-		/* Go to next entry in ring.  */
-		head++;
-		if (head >= pbm->msiq_ent_count)
-			head = 0;
-		ep = &base[head];
-	}
-
-	if (likely(head != orig_head)) {
-		/* ACK entries by updating head pointer.  */
-		fire_write(pbm->pbm_regs +
-			   EVENT_QUEUE_HEAD(msiqid),
-			   head);
-	}
-	return;
-
-bad_type:
-	printk(KERN_EMERG "MSI: Entry has bad type %lx\n", type);
-	return;
+	*head = fire_read(pbm->pbm_regs + EVENT_QUEUE_HEAD(msiqid));
+	return 0;
 }
 
-static int msi_bitmap_alloc(struct pci_pbm_info *pbm)
+static int pci_fire_dequeue_msi(struct pci_pbm_info *pbm, unsigned long msiqid,
+				unsigned long *head, unsigned long *msi)
 {
-	unsigned long size, bits_per_ulong;
+	unsigned long type_fmt, type, msi_num;
+	struct pci_msiq_entry *base, *ep;
 
-	bits_per_ulong = sizeof(unsigned long) * 8;
-	size = (pbm->msi_num + (bits_per_ulong - 1)) & ~(bits_per_ulong - 1);
-	size /= 8;
-	BUG_ON(size % sizeof(unsigned long));
+	base = (pbm->msi_queues + ((msiqid - pbm->msiq_first) * 8192));
+	ep = &base[*head];	/* entry at the caller's current head */
 
-	pbm->msi_bitmap = kzalloc(size, GFP_KERNEL);
-	if (!pbm->msi_bitmap)
-		return -ENOMEM;
+	if ((ep->word0 & MSIQ_WORD0_FMT_TYPE) == 0)
+		return 0;	/* no pending entry: nothing dequeued */
+
+	type_fmt = ((ep->word0 & MSIQ_WORD0_FMT_TYPE) >>
+		    MSIQ_WORD0_FMT_TYPE_SHIFT);
+	type = (type_fmt >> 3);
+	if (unlikely(type != MSIQ_TYPE_MSI32 &&
+		     type != MSIQ_TYPE_MSI64))
+		return -EINVAL;	/* entry is not an MSI32/MSI64 */
+
+	*msi = msi_num = ((ep->word0 & MSIQ_WORD0_DATA0) >>
+			  MSIQ_WORD0_DATA0_SHIFT);
+
+	fire_write(pbm->pbm_regs + MSI_CLEAR(msi_num),
+		   MSI_CLEAR_EQWR_N);
+
+	/* Clear the entry.  */
+	ep->word0 &= ~MSIQ_WORD0_FMT_TYPE;
+
+	/* Go to next entry in ring.  */
+	(*head)++;
+	if (*head >= pbm->msiq_ent_count)
+		*head = 0;
+
+	return 1;	/* one MSI was dequeued into *msi */
+}
+
+static int pci_fire_set_head(struct pci_pbm_info *pbm, unsigned long msiqid,
+			     unsigned long head)
+{
+	fire_write(pbm->pbm_regs + EVENT_QUEUE_HEAD(msiqid), head);
+	return 0;	/* this implementation always reports success */
+}
+
+static int pci_fire_msi_setup(struct pci_pbm_info *pbm, unsigned long msiqid,
+			      unsigned long msi, int is_msi64)
+{
+	u64 val;
+	/* Route 'msi' to event queue 'msiqid'; is_msi64 is unused here. */
+	val = fire_read(pbm->pbm_regs + MSI_MAP(msi));
+	val &= ~(MSI_MAP_EQNUM);
+	val |= msiqid;
+	fire_write(pbm->pbm_regs + MSI_MAP(msi), val);
+
+	fire_write(pbm->pbm_regs + MSI_CLEAR(msi),
+		   MSI_CLEAR_EQWR_N);
+	/* The mapping is marked valid only after routing and clearing. */
+	val = fire_read(pbm->pbm_regs + MSI_MAP(msi));
+	val |= MSI_MAP_VALID;
+	fire_write(pbm->pbm_regs + MSI_MAP(msi), val);
 
 	return 0;
 }
 
-static void msi_bitmap_free(struct pci_pbm_info *pbm)
+static int pci_fire_msi_teardown(struct pci_pbm_info *pbm, unsigned long msi)
 {
-	kfree(pbm->msi_bitmap);
-	pbm->msi_bitmap = NULL;
+	u64 val;
+
+	/* Invalidate the mapping register of 'msi' by clearing its VALID
+	 * bit.  All other bits, including the event queue number, are
+	 * written back unchanged.
+	 */
+	val = fire_read(pbm->pbm_regs + MSI_MAP(msi));
+	val &= ~MSI_MAP_VALID;
+	fire_write(pbm->pbm_regs + MSI_MAP(msi), val);
+
+	return 0;
 }
 
-static int msi_queue_alloc(struct pci_pbm_info *pbm)
+static int pci_fire_msiq_alloc(struct pci_pbm_info *pbm)
 {
 	unsigned long pages, order, i;
 
@@ -279,241 +281,65 @@
 	return 0;
 }
 
-static int alloc_msi(struct pci_pbm_info *pbm)
+static void pci_fire_msiq_free(struct pci_pbm_info *pbm)
 {
-	int i;
+	unsigned long pages, order;
 
-	for (i = 0; i < pbm->msi_num; i++) {
-		if (!test_and_set_bit(i, pbm->msi_bitmap))
-			return i + pbm->msi_first;
-	}
+	order = get_order(512 * 1024);
+	pages = (unsigned long) pbm->msi_queues;
 
-	return -ENOENT;
+	free_pages(pages, order);
+
+	pbm->msi_queues = NULL;
 }
 
-static void free_msi(struct pci_pbm_info *pbm, int msi_num)
+static int pci_fire_msiq_build_irq(struct pci_pbm_info *pbm,
+				   unsigned long msiqid,
+				   unsigned long devino)
 {
-	msi_num -= pbm->msi_first;
-	clear_bit(msi_num, pbm->msi_bitmap);
-}
-
-static int pci_setup_msi_irq(unsigned int *virt_irq_p,
-			     struct pci_dev *pdev,
-			     struct msi_desc *entry)
-{
-	struct pci_pbm_info *pbm = pdev->dev.archdata.host_controller;
-	unsigned long devino, msiqid, cregs, imap_off;
-	struct msi_msg msg;
-	int msi_num, err;
+	unsigned long cregs = (unsigned long) pbm->pbm_regs;
+	unsigned long imap_reg, iclr_reg, int_ctrlr;
+	unsigned int virt_irq;
+	int fixup;
 	u64 val;
 
-	*virt_irq_p = 0;
+	imap_reg = cregs + (0x001000UL + (devino * 0x08UL));
+	iclr_reg = cregs + (0x001400UL + (devino * 0x08UL));
 
-	msi_num = alloc_msi(pbm);
-	if (msi_num < 0)
-		return msi_num;
+	/* XXX iterate amongst the 4 IRQ controllers XXX */
+	int_ctrlr = (1UL << 6);
 
-	cregs = (unsigned long) pbm->pbm_regs;
+	val = fire_read(imap_reg);
+	val |= (1UL << 63) | int_ctrlr;
+	fire_write(imap_reg, val);
 
-	err = sun4u_build_msi(pbm->portid, virt_irq_p,
-			      pbm->msiq_first_devino,
-			      (pbm->msiq_first_devino +
-			       pbm->msiq_num),
-			      cregs + 0x001000UL,
-			      cregs + 0x001400UL);
-	if (err < 0)
-		goto out_err;
-	devino = err;
+	fixup = ((pbm->portid << 6) | devino) - int_ctrlr;
 
-	imap_off = 0x001000UL + (devino * 0x8UL);
-
-	val = fire_read(pbm->pbm_regs + imap_off);
-	val |= (1UL << 63) | (1UL << 6);
-	fire_write(pbm->pbm_regs + imap_off, val);
-
-	msiqid = ((devino - pbm->msiq_first_devino) +
-		  pbm->msiq_first);
+	virt_irq = build_irq(fixup, iclr_reg, imap_reg);
+	if (!virt_irq)
+		return -ENOMEM;
 
 	fire_write(pbm->pbm_regs +
 		   EVENT_QUEUE_CONTROL_SET(msiqid),
 		   EVENT_QUEUE_CONTROL_SET_EN);
 
-	val = fire_read(pbm->pbm_regs + MSI_MAP(msi_num));
-	val &= ~(MSI_MAP_EQNUM);
-	val |= msiqid;
-	fire_write(pbm->pbm_regs + MSI_MAP(msi_num), val);
-
-	fire_write(pbm->pbm_regs + MSI_CLEAR(msi_num),
-		   MSI_CLEAR_EQWR_N);
-
-	val = fire_read(pbm->pbm_regs + MSI_MAP(msi_num));
-	val |= MSI_MAP_VALID;
-	fire_write(pbm->pbm_regs + MSI_MAP(msi_num), val);
-
-	sparc64_set_msi(*virt_irq_p, msi_num);
-
-	if (entry->msi_attrib.is_64) {
-		msg.address_hi = pbm->msi64_start >> 32;
-		msg.address_lo = pbm->msi64_start & 0xffffffff;
-	} else {
-		msg.address_hi = 0;
-		msg.address_lo = pbm->msi32_start;
-	}
-	msg.data = msi_num;
-
-	set_irq_msi(*virt_irq_p, entry);
-	write_msi_msg(*virt_irq_p, &msg);
-
-	irq_install_pre_handler(*virt_irq_p,
-				pci_msi_prehandler,
-				pbm, (void *) msiqid);
-
-	return 0;
-
-out_err:
-	free_msi(pbm, msi_num);
-	return err;
+	return virt_irq;
 }
 
-static void pci_teardown_msi_irq(unsigned int virt_irq,
-				 struct pci_dev *pdev)
-{
-	struct pci_pbm_info *pbm = pdev->dev.archdata.host_controller;
-	unsigned long msiqid, msi_num;
-	u64 val;
-
-	msi_num = sparc64_get_msi(virt_irq);
-
-	val = fire_read(pbm->pbm_regs + MSI_MAP(msi_num));
-
-	msiqid = (val & MSI_MAP_EQNUM);
-
-	val &= ~MSI_MAP_VALID;
-	fire_write(pbm->pbm_regs + MSI_MAP(msi_num), val);
-
-	fire_write(pbm->pbm_regs + EVENT_QUEUE_CONTROL_CLEAR(msiqid),
-		   EVENT_QUEUE_CONTROL_CLEAR_DIS);
-
-	free_msi(pbm, msi_num);
-
-	/* The sun4u_destroy_msi() will liberate the devino and thus the MSIQ
-	 * allocation.
-	 */
-	sun4u_destroy_msi(virt_irq);
-}
+static const struct sparc64_msiq_ops pci_fire_msiq_ops = {
+	.get_head	=	pci_fire_get_head,
+	.dequeue_msi	=	pci_fire_dequeue_msi,
+	.set_head	=	pci_fire_set_head,
+	.msi_setup	=	pci_fire_msi_setup,
+	.msi_teardown	=	pci_fire_msi_teardown,
+	.msiq_alloc	=	pci_fire_msiq_alloc,
+	.msiq_free	=	pci_fire_msiq_free,
+	.msiq_build_irq	=	pci_fire_msiq_build_irq,
+};
 
 static void pci_fire_msi_init(struct pci_pbm_info *pbm)
 {
-	const u32 *val;
-	int len;
-
-	val = of_get_property(pbm->prom_node, "#msi-eqs", &len);
-	if (!val || len != 4)
-		goto no_msi;
-	pbm->msiq_num = *val;
-	if (pbm->msiq_num) {
-		const struct msiq_prop {
-			u32 first_msiq;
-			u32 num_msiq;
-			u32 first_devino;
-		} *mqp;
-		const struct msi_range_prop {
-			u32 first_msi;
-			u32 num_msi;
-		} *mrng;
-		const struct addr_range_prop {
-			u32 msi32_high;
-			u32 msi32_low;
-			u32 msi32_len;
-			u32 msi64_high;
-			u32 msi64_low;
-			u32 msi64_len;
-		} *arng;
-
-		val = of_get_property(pbm->prom_node, "msi-eq-size", &len);
-		if (!val || len != 4)
-			goto no_msi;
-
-		pbm->msiq_ent_count = *val;
-
-		mqp = of_get_property(pbm->prom_node,
-				      "msi-eq-to-devino", &len);
-		if (!mqp)
-			mqp = of_get_property(pbm->prom_node,
-					      "msi-eq-devino", &len);
-		if (!mqp || len != sizeof(struct msiq_prop))
-			goto no_msi;
-
-		pbm->msiq_first = mqp->first_msiq;
-		pbm->msiq_first_devino = mqp->first_devino;
-
-		val = of_get_property(pbm->prom_node, "#msi", &len);
-		if (!val || len != 4)
-			goto no_msi;
-		pbm->msi_num = *val;
-
-		mrng = of_get_property(pbm->prom_node, "msi-ranges", &len);
-		if (!mrng || len != sizeof(struct msi_range_prop))
-			goto no_msi;
-		pbm->msi_first = mrng->first_msi;
-
-		val = of_get_property(pbm->prom_node, "msi-data-mask", &len);
-		if (!val || len != 4)
-			goto no_msi;
-		pbm->msi_data_mask = *val;
-
-		val = of_get_property(pbm->prom_node, "msix-data-width", &len);
-		if (!val || len != 4)
-			goto no_msi;
-		pbm->msix_data_width = *val;
-
-		arng = of_get_property(pbm->prom_node, "msi-address-ranges",
-				       &len);
-		if (!arng || len != sizeof(struct addr_range_prop))
-			goto no_msi;
-		pbm->msi32_start = ((u64)arng->msi32_high << 32) |
-			(u64) arng->msi32_low;
-		pbm->msi64_start = ((u64)arng->msi64_high << 32) |
-			(u64) arng->msi64_low;
-		pbm->msi32_len = arng->msi32_len;
-		pbm->msi64_len = arng->msi64_len;
-
-		if (msi_bitmap_alloc(pbm))
-			goto no_msi;
-
-		if (msi_queue_alloc(pbm)) {
-			msi_bitmap_free(pbm);
-			goto no_msi;
-		}
-
-		printk(KERN_INFO "%s: MSI Queue first[%u] num[%u] count[%u] "
-		       "devino[0x%x]\n",
-		       pbm->name,
-		       pbm->msiq_first, pbm->msiq_num,
-		       pbm->msiq_ent_count,
-		       pbm->msiq_first_devino);
-		printk(KERN_INFO "%s: MSI first[%u] num[%u] mask[0x%x] "
-		       "width[%u]\n",
-		       pbm->name,
-		       pbm->msi_first, pbm->msi_num, pbm->msi_data_mask,
-		       pbm->msix_data_width);
-		printk(KERN_INFO "%s: MSI addr32[0x%lx:0x%x] "
-		       "addr64[0x%lx:0x%x]\n",
-		       pbm->name,
-		       pbm->msi32_start, pbm->msi32_len,
-		       pbm->msi64_start, pbm->msi64_len);
-		printk(KERN_INFO "%s: MSI queues at RA [%016lx]\n",
-		       pbm->name,
-		       __pa(pbm->msi_queues));
-	}
-	pbm->setup_msi_irq = pci_setup_msi_irq;
-	pbm->teardown_msi_irq = pci_teardown_msi_irq;
-
-	return;
-
-no_msi:
-	pbm->msiq_num = 0;
-	printk(KERN_INFO "%s: No MSI support.\n", pbm->name);
+	sparc64_pbm_msi_init(pbm, &pci_fire_msiq_ops);
 }
 #else /* CONFIG_PCI_MSI */
 static void pci_fire_msi_init(struct pci_pbm_info *pbm)
diff --git a/arch/sparc64/kernel/pci_impl.h b/arch/sparc64/kernel/pci_impl.h
index f660c2b..ccbb188 100644
--- a/arch/sparc64/kernel/pci_impl.h
+++ b/arch/sparc64/kernel/pci_impl.h
@@ -29,6 +29,33 @@
 #define PCI_STC_FLUSHFLAG_SET(STC) \
 	(*((STC)->strbuf_flushflag) != 0UL)
 
+#ifdef CONFIG_PCI_MSI
+struct pci_pbm_info;
+struct sparc64_msiq_ops {	/* per-controller MSI queue callbacks */
+	int (*get_head)(struct pci_pbm_info *pbm, unsigned long msiqid,
+			unsigned long *head);	/* read queue head */
+	int (*dequeue_msi)(struct pci_pbm_info *pbm, unsigned long msiqid,
+			   unsigned long *head, unsigned long *msi);
+	int (*set_head)(struct pci_pbm_info *pbm, unsigned long msiqid,
+			unsigned long head);	/* write queue head back */
+	int (*msi_setup)(struct pci_pbm_info *pbm, unsigned long msiqid,
+			 unsigned long msi, int is_msi64);
+	int (*msi_teardown)(struct pci_pbm_info *pbm, unsigned long msi);
+	int (*msiq_alloc)(struct pci_pbm_info *pbm);	/* nonzero = failure */
+	void (*msiq_free)(struct pci_pbm_info *pbm);
+	int (*msiq_build_irq)(struct pci_pbm_info *pbm, unsigned long msiqid,
+			      unsigned long devino);	/* irq or -errno */
+};
+
+extern void sparc64_pbm_msi_init(struct pci_pbm_info *pbm,
+				 const struct sparc64_msiq_ops *ops);
+
+struct sparc64_msiq_cookie {	/* dev_id of one MSI queue's IRQ handler */
+	struct pci_pbm_info *pbm;	/* PBM that owns the queue */
+	unsigned long msiqid;		/* queue this cookie stands for */
+};
+#endif
+
 struct pci_controller_info;
 
 struct pci_pbm_info {
@@ -90,6 +117,8 @@
 	u32				msiq_ent_count;
 	u32				msiq_first;
 	u32				msiq_first_devino;
+	u32				msiq_rotor;
+	struct sparc64_msiq_cookie	*msiq_irq_cookies;
 	u32				msi_num;
 	u32				msi_first;
 	u32				msi_data_mask;
@@ -100,9 +129,11 @@
 	u32				msi64_len;
 	void				*msi_queues;
 	unsigned long			*msi_bitmap;
+	unsigned int			*msi_irq_table;
 	int (*setup_msi_irq)(unsigned int *virt_irq_p, struct pci_dev *pdev,
 			     struct msi_desc *entry);
 	void (*teardown_msi_irq)(unsigned int virt_irq, struct pci_dev *pdev);
+	const struct sparc64_msiq_ops	*msi_ops;
 #endif /* !(CONFIG_PCI_MSI) */
 
 	/* This PBM's streaming buffer. */
diff --git a/arch/sparc64/kernel/pci_msi.c b/arch/sparc64/kernel/pci_msi.c
new file mode 100644
index 0000000..0fa33b1
--- /dev/null
+++ b/arch/sparc64/kernel/pci_msi.c
@@ -0,0 +1,433 @@
+/* pci_msi.c: Sparc64 MSI support common layer.
+ *
+ * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+
+#include "pci_impl.h"
+
+static irqreturn_t sparc64_msiq_interrupt(int irq, void *cookie)
+{
+	struct sparc64_msiq_cookie *msiq_cookie = cookie;
+	struct pci_pbm_info *pbm = msiq_cookie->pbm;
+	unsigned long msiqid = msiq_cookie->msiqid;
+	const struct sparc64_msiq_ops *ops;
+	unsigned long orig_head, head;
+	int err;
+
+	ops = pbm->msi_ops;
+	/* Drain this MSI queue, running the virtual IRQ for each MSI. */
+	err = ops->get_head(pbm, msiqid, &head);
+	if (unlikely(err < 0))
+		goto err_get_head;
+
+	orig_head = head;
+	for (;;) {
+		unsigned long msi;
+
+		err = ops->dequeue_msi(pbm, msiqid, &head, &msi);
+		if (likely(err > 0))	/* an MSI was dequeued into 'msi' */
+			__do_IRQ(pbm->msi_irq_table[msi - pbm->msi_first]);
+		/* err < 0: dequeue failed; err == 0: no more entries. */
+		if (unlikely(err < 0))
+			goto err_dequeue;
+
+		if (err == 0)
+			break;
+	}
+	if (likely(head != orig_head)) {	/* entries were consumed */
+		err = ops->set_head(pbm, msiqid, head);
+		if (unlikely(err < 0))
+			goto err_set_head;
+	}
+	return IRQ_HANDLED;
+
+err_get_head:
+	printk(KERN_EMERG "MSI: Get head on msiqid[%lu] gives error %d\n",
+	       msiqid, err);
+	goto err_out;
+
+err_dequeue:
+	printk(KERN_EMERG "MSI: Dequeue head[%lu] from msiqid[%lu] "
+	       "gives error %d\n",
+	       head, msiqid, err);
+	goto err_out;
+
+err_set_head:
+	printk(KERN_EMERG "MSI: Set head[%lu] on msiqid[%lu] "
+	       "gives error %d\n",
+	       head, msiqid, err);
+	goto err_out;
+
+err_out:
+	return IRQ_NONE;	/* every error path reports "not handled" */
+}
+
+static u32 pick_msiq(struct pci_pbm_info *pbm)
+{
+	static DEFINE_SPINLOCK(rotor_lock);	/* shared by every PBM */
+	unsigned long flags;
+	u32 ret, rotor;
+
+	spin_lock_irqsave(&rotor_lock, flags);
+	/* Hand out this PBM's MSI queues round-robin via msiq_rotor. */
+	rotor = pbm->msiq_rotor;
+	ret = pbm->msiq_first + rotor;
+
+	if (++rotor >= pbm->msiq_num)
+		rotor = 0;	/* wrap back to the first queue */
+	pbm->msiq_rotor = rotor;
+
+	spin_unlock_irqrestore(&rotor_lock, flags);
+
+	return ret;
+}
+
+
+static int alloc_msi(struct pci_pbm_info *pbm)
+{
+	int i;
+	/* Claim the first clear bit in msi_bitmap; return its MSI number. */
+	for (i = 0; i < pbm->msi_num; i++) {
+		if (!test_and_set_bit(i, pbm->msi_bitmap))
+			return i + pbm->msi_first;
+	}
+
+	return -ENOENT;	/* every MSI of this PBM is already taken */
+}
+
+static void free_msi(struct pci_pbm_info *pbm, int msi_num)
+{
+	msi_num -= pbm->msi_first;	/* MSI number -> bitmap index */
+	clear_bit(msi_num, pbm->msi_bitmap);	/* available to alloc_msi() */
+}
+
+static struct irq_chip msi_irq = {	/* chip for per-MSI virtual IRQs */
+	.typename	= "PCI-MSI",
+	.mask		= mask_msi_irq,
+	.unmask		= unmask_msi_irq,
+	.enable		= unmask_msi_irq,	/* enable == unmask */
+	.disable	= mask_msi_irq,		/* disable == mask */
+	/* XXX affinity XXX: no .set_affinity handler is provided yet */
+};
+
+int sparc64_setup_msi_irq(unsigned int *virt_irq_p,
+			  struct pci_dev *pdev,
+			  struct msi_desc *entry)
+{
+	struct pci_pbm_info *pbm = pdev->dev.archdata.host_controller;
+	const struct sparc64_msiq_ops *ops = pbm->msi_ops;
+	struct msi_msg msg;
+	int msi, err;
+	u32 msiqid;
+
+	*virt_irq_p = virt_irq_alloc(~0);	/* ~0 as the real irq */
+	err = -ENOMEM;
+	if (!*virt_irq_p)
+		goto out_err;
+
+	set_irq_chip(*virt_irq_p, &msi_irq);
+	/* Reserve an MSI number, then bind it to a round-robin MSI queue. */
+	err = alloc_msi(pbm);
+	if (unlikely(err < 0))
+		goto out_virt_irq_free;
+
+	msi = err;
+
+	msiqid = pick_msiq(pbm);
+
+	err = ops->msi_setup(pbm, msiqid, msi,
+			     (entry->msi_attrib.is_64 ? 1 : 0));
+	if (err)
+		goto out_msi_free;
+	/* Lets sparc64_msiq_interrupt() map the MSI back to this IRQ. */
+	pbm->msi_irq_table[msi - pbm->msi_first] = *virt_irq_p;
+	/* Build the MSI message: 32- or 64-bit address, data = MSI number. */
+	if (entry->msi_attrib.is_64) {
+		msg.address_hi = pbm->msi64_start >> 32;
+		msg.address_lo = pbm->msi64_start & 0xffffffff;
+	} else {
+		msg.address_hi = 0;
+		msg.address_lo = pbm->msi32_start;
+	}
+	msg.data = msi;
+
+	set_irq_msi(*virt_irq_p, entry);
+	write_msi_msg(*virt_irq_p, &msg);
+
+	return 0;
+
+out_msi_free:
+	free_msi(pbm, msi);
+
+out_virt_irq_free:
+	set_irq_chip(*virt_irq_p, NULL);	/* undo set_irq_chip() above */
+	virt_irq_free(*virt_irq_p);
+	*virt_irq_p = 0;
+
+out_err:
+	return err;
+}
+
+void sparc64_teardown_msi_irq(unsigned int virt_irq,
+			      struct pci_dev *pdev)
+{
+	struct pci_pbm_info *pbm = pdev->dev.archdata.host_controller;
+	const struct sparc64_msiq_ops *ops = pbm->msi_ops;
+	unsigned int msi_num;
+	int i, err;
+	/* Reverse lookup: find which MSI slot was bound to virt_irq. */
+	for (i = 0; i < pbm->msi_num; i++) {
+		if (pbm->msi_irq_table[i] == virt_irq)
+			break;
+	}
+	if (i >= pbm->msi_num) {
+		printk(KERN_ERR "%s: teardown: No MSI for irq %u\n",
+		       pbm->name, virt_irq);
+		return;
+	}
+
+	msi_num = pbm->msi_first + i;
+	pbm->msi_irq_table[i] = ~0U;	/* slot no longer maps to an IRQ */
+
+	err = ops->msi_teardown(pbm, msi_num);
+	if (err) {
+		printk(KERN_ERR "%s: teardown: ops->teardown() on MSI %u, "
+		       "irq %u, gives error %d\n",
+		       pbm->name, msi_num, virt_irq, err);
+		return;	/* NOTE(review): MSI and virt_irq are not released */
+	}
+
+	free_msi(pbm, msi_num);
+
+	set_irq_chip(virt_irq, NULL);
+	virt_irq_free(virt_irq);
+}
+
+static int msi_bitmap_alloc(struct pci_pbm_info *pbm)
+{
+	unsigned long size, bits_per_ulong;
+	/* One bit per MSI, rounded up to a whole number of longs. */
+	bits_per_ulong = sizeof(unsigned long) * 8;
+	size = (pbm->msi_num + (bits_per_ulong - 1)) & ~(bits_per_ulong - 1);
+	size /= 8;	/* bits -> bytes */
+	BUG_ON(size % sizeof(unsigned long));
+
+	pbm->msi_bitmap = kzalloc(size, GFP_KERNEL);
+	if (!pbm->msi_bitmap)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void msi_bitmap_free(struct pci_pbm_info *pbm)
+{
+	kfree(pbm->msi_bitmap);
+	pbm->msi_bitmap = NULL;	/* do not leave a stale pointer behind */
+}
+
+static int msi_table_alloc(struct pci_pbm_info *pbm)
+{
+	int size, i;
+	/* One cookie per MSI queue: the dev_id given to request_irq(). */
+	size = pbm->msiq_num * sizeof(struct sparc64_msiq_cookie);
+	pbm->msiq_irq_cookies = kzalloc(size, GFP_KERNEL);
+	if (!pbm->msiq_irq_cookies)
+		return -ENOMEM;
+
+	for (i = 0; i < pbm->msiq_num; i++) {
+		struct sparc64_msiq_cookie *p;
+
+		p = &pbm->msiq_irq_cookies[i];
+		p->pbm = pbm;
+		p->msiqid = pbm->msiq_first + i;
+	}
+	/* One virtual IRQ slot per MSI, indexed by (msi - msi_first). */
+	size = pbm->msi_num * sizeof(unsigned int);
+	pbm->msi_irq_table = kzalloc(size, GFP_KERNEL);
+	if (!pbm->msi_irq_table) {
+		kfree(pbm->msiq_irq_cookies);
+		pbm->msiq_irq_cookies = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void msi_table_free(struct pci_pbm_info *pbm)
+{
+	kfree(pbm->msiq_irq_cookies);	/* per-queue handler cookies */
+	pbm->msiq_irq_cookies = NULL;
+
+	kfree(pbm->msi_irq_table);	/* MSI -> virtual IRQ table */
+	pbm->msi_irq_table = NULL;
+}
+
+static int bringup_one_msi_queue(struct pci_pbm_info *pbm,
+				 const struct sparc64_msiq_ops *ops,
+				 unsigned long msiqid,
+				 unsigned long devino)
+{
+	int irq = ops->msiq_build_irq(pbm, msiqid, devino);
+	int err;
+	/* A negative value from msiq_build_irq() is passed up as an error. */
+	if (irq < 0)
+		return irq;
+	/* The cookie tells the handler which PBM and queue to service. */
+	err = request_irq(irq, sparc64_msiq_interrupt, 0,
+			  "MSIQ",
+			  &pbm->msiq_irq_cookies[msiqid - pbm->msiq_first]);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int sparc64_bringup_msi_queues(struct pci_pbm_info *pbm,
+				      const struct sparc64_msiq_ops *ops)
+{
+	int i;
+	/* Queue i is msiq_first + i and uses devino msiq_first_devino + i. */
+	for (i = 0; i < pbm->msiq_num; i++) {
+		unsigned long msiqid = i + pbm->msiq_first;
+		unsigned long devino = i + pbm->msiq_first_devino;
+		int err;
+
+		err = bringup_one_msi_queue(pbm, ops, msiqid, devino);
+		if (err)
+			return err;	/* earlier queues are not undone */
+	}
+
+	return 0;
+}
+
+void sparc64_pbm_msi_init(struct pci_pbm_info *pbm,
+			  const struct sparc64_msiq_ops *ops)
+{
+	const u32 *val;
+	int len;
+	/* A missing or mis-sized property means no MSI on this PBM. */
+	val = of_get_property(pbm->prom_node, "#msi-eqs", &len);
+	if (!val || len != 4)
+		goto no_msi;
+	pbm->msiq_num = *val;
+	if (pbm->msiq_num) {
+		const struct msiq_prop {
+			u32 first_msiq;
+			u32 num_msiq;
+			u32 first_devino;
+		} *mqp;
+		const struct msi_range_prop {
+			u32 first_msi;
+			u32 num_msi;
+		} *mrng;
+		const struct addr_range_prop {
+			u32 msi32_high;
+			u32 msi32_low;
+			u32 msi32_len;
+			u32 msi64_high;
+			u32 msi64_low;
+			u32 msi64_len;
+		} *arng;
+
+		val = of_get_property(pbm->prom_node, "msi-eq-size", &len);
+		if (!val || len != 4)
+			goto no_msi;
+
+		pbm->msiq_ent_count = *val;
+
+		mqp = of_get_property(pbm->prom_node,
+				      "msi-eq-to-devino", &len);
+		if (!mqp)
+			mqp = of_get_property(pbm->prom_node,
+					      "msi-eq-devino", &len);
+		if (!mqp || len != sizeof(struct msiq_prop))
+			goto no_msi;
+
+		pbm->msiq_first = mqp->first_msiq;
+		pbm->msiq_first_devino = mqp->first_devino;
+
+		val = of_get_property(pbm->prom_node, "#msi", &len);
+		if (!val || len != 4)
+			goto no_msi;
+		pbm->msi_num = *val;
+
+		mrng = of_get_property(pbm->prom_node, "msi-ranges", &len);
+		if (!mrng || len != sizeof(struct msi_range_prop))
+			goto no_msi;
+		pbm->msi_first = mrng->first_msi;
+
+		val = of_get_property(pbm->prom_node, "msi-data-mask", &len);
+		if (!val || len != 4)
+			goto no_msi;
+		pbm->msi_data_mask = *val;
+
+		val = of_get_property(pbm->prom_node, "msix-data-width", &len);
+		if (!val || len != 4)
+			goto no_msi;
+		pbm->msix_data_width = *val;
+
+		arng = of_get_property(pbm->prom_node, "msi-address-ranges",
+				       &len);
+		if (!arng || len != sizeof(struct addr_range_prop))
+			goto no_msi;
+		pbm->msi32_start = ((u64)arng->msi32_high << 32) |
+			(u64) arng->msi32_low;
+		pbm->msi64_start = ((u64)arng->msi64_high << 32) |
+			(u64) arng->msi64_low;
+		pbm->msi32_len = arng->msi32_len;
+		pbm->msi64_len = arng->msi64_len;
+		/* Allocate everything; unwind in reverse order on failure. */
+		if (msi_bitmap_alloc(pbm))
+			goto no_msi;
+
+		if (msi_table_alloc(pbm)) {
+			msi_bitmap_free(pbm);
+			goto no_msi;
+		}
+
+		if (ops->msiq_alloc(pbm)) {
+			msi_table_free(pbm);
+			msi_bitmap_free(pbm);
+			goto no_msi;
+		}
+
+		if (sparc64_bringup_msi_queues(pbm, ops)) {
+			ops->msiq_free(pbm);
+			msi_table_free(pbm);
+			msi_bitmap_free(pbm);
+			goto no_msi;
+		}
+
+		printk(KERN_INFO "%s: MSI Queue first[%u] num[%u] count[%u] "
+		       "devino[0x%x]\n",
+		       pbm->name,
+		       pbm->msiq_first, pbm->msiq_num,
+		       pbm->msiq_ent_count,
+		       pbm->msiq_first_devino);
+		printk(KERN_INFO "%s: MSI first[%u] num[%u] mask[0x%x] "
+		       "width[%u]\n",
+		       pbm->name,
+		       pbm->msi_first, pbm->msi_num, pbm->msi_data_mask,
+		       pbm->msix_data_width);
+		printk(KERN_INFO "%s: MSI addr32[0x%lx:0x%x] "
+		       "addr64[0x%lx:0x%x]\n",
+		       pbm->name,
+		       pbm->msi32_start, pbm->msi32_len,
+		       pbm->msi64_start, pbm->msi64_len);
+		printk(KERN_INFO "%s: MSI queues at RA [%016lx]\n",
+		       pbm->name,
+		       __pa(pbm->msi_queues));
+		/* NOTE(review): ops are set after the queue IRQs are requested. */
+		pbm->msi_ops = ops;
+		pbm->setup_msi_irq = sparc64_setup_msi_irq;
+		pbm->teardown_msi_irq = sparc64_teardown_msi_irq;
+	}
+	return;
+
+no_msi:
+	pbm->msiq_num = 0;	/* zero queues == MSI disabled on this PBM */
+	printk(KERN_INFO "%s: No MSI support.\n", pbm->name);
+}
diff --git a/arch/sparc64/kernel/pci_sun4v.c b/arch/sparc64/kernel/pci_sun4v.c
index da724b1..97c45b2 100644
--- a/arch/sparc64/kernel/pci_sun4v.c
+++ b/arch/sparc64/kernel/pci_sun4v.c
@@ -748,111 +748,102 @@
 	u64		reserved2;
 };
 
-/* For now this just runs as a pre-handler for the real interrupt handler.
- * So we just walk through the queue and ACK all the entries, update the
- * head pointer, and return.
- *
- * In the longer term it would be nice to do something more integrated
- * wherein we can pass in some of this MSI info to the drivers.  This
- * would be most useful for PCIe fabric error messages, although we could
- * invoke those directly from the loop here in order to pass the info around.
- */
-static void pci_sun4v_msi_prehandler(unsigned int ino, void *data1, void *data2)
+static int pci_sun4v_get_head(struct pci_pbm_info *pbm, unsigned long msiqid,
+			      unsigned long *head)
 {
-	struct pci_pbm_info *pbm = data1;
-	struct pci_sun4v_msiq_entry *base, *ep;
-	unsigned long msiqid, orig_head, head, type, err;
+	unsigned long err, limit;
 
-	msiqid = (unsigned long) data2;
-
-	head = 0xdeadbeef;
-	err = pci_sun4v_msiq_gethead(pbm->devhandle, msiqid, &head);
+	err = pci_sun4v_msiq_gethead(pbm->devhandle, msiqid, head);
 	if (unlikely(err))
-		goto hv_error_get;
+		return -ENXIO;
 
-	if (unlikely(head >= (pbm->msiq_ent_count * sizeof(struct pci_sun4v_msiq_entry))))
-		goto bad_offset;
-
-	head /= sizeof(struct pci_sun4v_msiq_entry);
-	orig_head = head;
-	base = (pbm->msi_queues + ((msiqid - pbm->msiq_first) *
-				   (pbm->msiq_ent_count *
-				    sizeof(struct pci_sun4v_msiq_entry))));
-	ep = &base[head];
-	while ((ep->version_type & MSIQ_TYPE_MASK) != 0) {
-		type = (ep->version_type & MSIQ_TYPE_MASK) >> MSIQ_TYPE_SHIFT;
-		if (unlikely(type != MSIQ_TYPE_MSI32 &&
-			     type != MSIQ_TYPE_MSI64))
-			goto bad_type;
-
-		pci_sun4v_msi_setstate(pbm->devhandle,
-				       ep->msi_data /* msi_num */,
-				       HV_MSISTATE_IDLE);
-
-		/* Clear the entry.  */
-		ep->version_type &= ~MSIQ_TYPE_MASK;
-
-		/* Go to next entry in ring.  */
-		head++;
-		if (head >= pbm->msiq_ent_count)
-			head = 0;
-		ep = &base[head];
-	}
-
-	if (likely(head != orig_head)) {
-		/* ACK entries by updating head pointer.  */
-		head *= sizeof(struct pci_sun4v_msiq_entry);
-		err = pci_sun4v_msiq_sethead(pbm->devhandle, msiqid, head);
-		if (unlikely(err))
-			goto hv_error_set;
-	}
-	return;
-
-hv_error_set:
-	printk(KERN_EMERG "MSI: Hypervisor set head gives error %lu\n", err);
-	goto hv_error_cont;
-
-hv_error_get:
-	printk(KERN_EMERG "MSI: Hypervisor get head gives error %lu\n", err);
-
-hv_error_cont:
-	printk(KERN_EMERG "MSI: devhandle[%x] msiqid[%lx] head[%lu]\n",
-	       pbm->devhandle, msiqid, head);
-	return;
-
-bad_offset:
-	printk(KERN_EMERG "MSI: Hypervisor gives bad offset %lx max(%lx)\n",
-	       head, pbm->msiq_ent_count * sizeof(struct pci_sun4v_msiq_entry));
-	return;
-
-bad_type:
-	printk(KERN_EMERG "MSI: Entry has bad type %lx\n", type);
-	return;
-}
-
-static int msi_bitmap_alloc(struct pci_pbm_info *pbm)
-{
-	unsigned long size, bits_per_ulong;
-
-	bits_per_ulong = sizeof(unsigned long) * 8;
-	size = (pbm->msi_num + (bits_per_ulong - 1)) & ~(bits_per_ulong - 1);
-	size /= 8;
-	BUG_ON(size % sizeof(unsigned long));
-
-	pbm->msi_bitmap = kzalloc(size, GFP_KERNEL);
-	if (!pbm->msi_bitmap)
-		return -ENOMEM;
+	limit = pbm->msiq_ent_count * sizeof(struct pci_sun4v_msiq_entry);
+	if (unlikely(*head >= limit))
+		return -EFBIG;
 
 	return 0;
 }
 
-static void msi_bitmap_free(struct pci_pbm_info *pbm)
+static int pci_sun4v_dequeue_msi(struct pci_pbm_info *pbm,
+				 unsigned long msiqid, unsigned long *head,
+				 unsigned long *msi)	/* 1 = dequeued, 0 = empty, <0 = error */
 {
-	kfree(pbm->msi_bitmap);
-	pbm->msi_bitmap = NULL;
+	struct pci_sun4v_msiq_entry *ep;
+	unsigned long err, type;
+
+	/* Note: void pointer arithmetic, 'head' is a byte offset  */
+	ep = (pbm->msi_queues + ((msiqid - pbm->msiq_first) *
+				 (pbm->msiq_ent_count *
+				  sizeof(struct pci_sun4v_msiq_entry))) +
+	      *head);
+
+	if ((ep->version_type & MSIQ_TYPE_MASK) == 0)
+		return 0;	/* type field clear: nothing queued at *head */
+
+	type = (ep->version_type & MSIQ_TYPE_MASK) >> MSIQ_TYPE_SHIFT;
+	if (unlikely(type != MSIQ_TYPE_MSI32 &&
+		     type != MSIQ_TYPE_MSI64))
+		return -EINVAL;	/* only MSI32/MSI64 entries are accepted */
+
+	*msi = ep->msi_data;	/* hand the MSI number back to the caller */
+
+	err = pci_sun4v_msi_setstate(pbm->devhandle,
+				     ep->msi_data /* msi_num */,
+				     HV_MSISTATE_IDLE);
+	if (unlikely(err))
+		return -ENXIO;	/* *msi already written, *head not advanced */
+
+	/* Clear the entry.  */
+	ep->version_type &= ~MSIQ_TYPE_MASK;
+
+	(*head) += sizeof(struct pci_sun4v_msiq_entry);
+	if (*head >=
+	    (pbm->msiq_ent_count * sizeof(struct pci_sun4v_msiq_entry)))
+		*head = 0;	/* stepped past the last entry: wrap to offset 0 */
+
+	return 1;
 }
 
-static int msi_queue_alloc(struct pci_pbm_info *pbm)
+static int pci_sun4v_set_head(struct pci_pbm_info *pbm, unsigned long msiqid,
+			      unsigned long head)	/* 0 on success, -EINVAL on failure */
+{
+	unsigned long err;
+
+	err = pci_sun4v_msiq_sethead(pbm->devhandle, msiqid, head);	/* head is passed through unmodified */
+	if (unlikely(err))
+		return -EINVAL;	/* NOTE(review): pci_sun4v_get_head uses -ENXIO for the analogous failure */
+
+	return 0;
+}
+
+static int pci_sun4v_msi_setup(struct pci_pbm_info *pbm, unsigned long msiqid,
+			       unsigned long msi, int is_msi64)	/* 0 on success, -ENXIO if any step fails */
+{
+	if (pci_sun4v_msi_setmsiq(pbm->devhandle, msi, msiqid,
+				  (is_msi64 ?
+				   HV_MSITYPE_MSI64 : HV_MSITYPE_MSI32)))
+		return -ENXIO;	/* setmsiq (msi, msiqid, MSI32/MSI64 type) failed */
+	if (pci_sun4v_msi_setstate(pbm->devhandle, msi, HV_MSISTATE_IDLE))
+		return -ENXIO;	/* could not set the MSI state to IDLE */
+	if (pci_sun4v_msi_setvalid(pbm->devhandle, msi, HV_MSIVALID_VALID))
+		return -ENXIO;	/* could not mark the MSI valid */
+	return 0;
+}
+
+static int pci_sun4v_msi_teardown(struct pci_pbm_info *pbm, unsigned long msi)	/* 0, or -ENXIO if getmsiq fails */
+{
+	unsigned long err, msiqid;	/* msiqid is only an out-param for getmsiq below */
+
+	err = pci_sun4v_msi_getmsiq(pbm->devhandle, msi, &msiqid);
+	if (err)
+		return -ENXIO;
+
+	pci_sun4v_msi_setvalid(pbm->devhandle, msi, HV_MSIVALID_INVALID);	/* status not checked */
+
+	return 0;
+}
+
+static int pci_sun4v_msiq_alloc(struct pci_pbm_info *pbm)
 {
 	unsigned long q_size, alloc_size, pages, order;
 	int i;
@@ -906,232 +897,59 @@
 	return -EINVAL;
 }
 
-
-static int alloc_msi(struct pci_pbm_info *pbm)
+static void pci_sun4v_msiq_free(struct pci_pbm_info *pbm)	/* re-conf every queue with 0UL/0, then free the queue pages */
 {
+	unsigned long q_size, alloc_size, pages, order;
 	int i;
 
-	for (i = 0; i < pbm->msi_num; i++) {
-		if (!test_and_set_bit(i, pbm->msi_bitmap))
-			return i + pbm->msi_first;
+	for (i = 0; i < pbm->msiq_num; i++) {	/* conf status is discarded via the (void) cast */
+		unsigned long msiqid = pbm->msiq_first + i;
+
+		(void) pci_sun4v_msiq_conf(pbm->devhandle, msiqid, 0UL, 0);
 	}
 
-	return -ENOENT;
+	q_size = pbm->msiq_ent_count * sizeof(struct pci_sun4v_msiq_entry);	/* bytes per queue */
+	alloc_size = (pbm->msiq_num * q_size);	/* bytes for all queues together */
+	order = get_order(alloc_size);
+
+	pages = (unsigned long) pbm->msi_queues;
+
+	free_pages(pages, order);
+
+	pbm->msi_queues = NULL;	/* do not leave a pointer to freed pages */
 }
 
-static void free_msi(struct pci_pbm_info *pbm, int msi_num)
+static int pci_sun4v_msiq_build_irq(struct pci_pbm_info *pbm,
+				    unsigned long msiqid,
+				    unsigned long devino)
 {
-	msi_num -= pbm->msi_first;
-	clear_bit(msi_num, pbm->msi_bitmap);
-}
+	unsigned int virt_irq = sun4v_build_irq(pbm->devhandle, devino);
 
-static int pci_sun4v_setup_msi_irq(unsigned int *virt_irq_p,
-				   struct pci_dev *pdev,
-				   struct msi_desc *entry)
-{
-	struct pci_pbm_info *pbm = pdev->dev.archdata.host_controller;
-	unsigned long devino, msiqid;
-	struct msi_msg msg;
-	int msi_num, err;
+	if (!virt_irq)
+		return -ENOMEM;
 
-	*virt_irq_p = 0;
-
-	msi_num = alloc_msi(pbm);
-	if (msi_num < 0)
-		return msi_num;
-
-	err = sun4v_build_msi(pbm->devhandle, virt_irq_p,
-			      pbm->msiq_first_devino,
-			      (pbm->msiq_first_devino +
-			       pbm->msiq_num));
-	if (err < 0)
-		goto out_err;
-	devino = err;
-
-	msiqid = ((devino - pbm->msiq_first_devino) +
-		  pbm->msiq_first);
-
-	err = -EINVAL;
 	if (pci_sun4v_msiq_setstate(pbm->devhandle, msiqid, HV_MSIQSTATE_IDLE))
-	if (err)
-		goto out_err;
-
+		return -EINVAL;
 	if (pci_sun4v_msiq_setvalid(pbm->devhandle, msiqid, HV_MSIQ_VALID))
-		goto out_err;
+		return -EINVAL;
 
-	if (pci_sun4v_msi_setmsiq(pbm->devhandle,
-				  msi_num, msiqid,
-				  (entry->msi_attrib.is_64 ?
-				   HV_MSITYPE_MSI64 : HV_MSITYPE_MSI32)))
-		goto out_err;
-
-	if (pci_sun4v_msi_setstate(pbm->devhandle, msi_num, HV_MSISTATE_IDLE))
-		goto out_err;
-
-	if (pci_sun4v_msi_setvalid(pbm->devhandle, msi_num, HV_MSIVALID_VALID))
-		goto out_err;
-
-	sparc64_set_msi(*virt_irq_p, msi_num);
-
-	if (entry->msi_attrib.is_64) {
-		msg.address_hi = pbm->msi64_start >> 32;
-		msg.address_lo = pbm->msi64_start & 0xffffffff;
-	} else {
-		msg.address_hi = 0;
-		msg.address_lo = pbm->msi32_start;
-	}
-	msg.data = msi_num;
-
-	set_irq_msi(*virt_irq_p, entry);
-	write_msi_msg(*virt_irq_p, &msg);
-
-	irq_install_pre_handler(*virt_irq_p,
-				pci_sun4v_msi_prehandler,
-				pbm, (void *) msiqid);
-
-	return 0;
-
-out_err:
-	free_msi(pbm, msi_num);
-	return err;
-
+	return virt_irq;
 }
 
-static void pci_sun4v_teardown_msi_irq(unsigned int virt_irq,
-				       struct pci_dev *pdev)
-{
-	struct pci_pbm_info *pbm = pdev->dev.archdata.host_controller;
-	unsigned long msiqid, err;
-	unsigned int msi_num;
-
-	msi_num = sparc64_get_msi(virt_irq);
-	err = pci_sun4v_msi_getmsiq(pbm->devhandle, msi_num, &msiqid);
-	if (err) {
-		printk(KERN_ERR "%s: getmsiq gives error %lu\n",
-		       pbm->name, err);
-		return;
-	}
-
-	pci_sun4v_msi_setvalid(pbm->devhandle, msi_num, HV_MSIVALID_INVALID);
-	pci_sun4v_msiq_setvalid(pbm->devhandle, msiqid, HV_MSIQ_INVALID);
-
-	free_msi(pbm, msi_num);
-
-	/* The sun4v_destroy_msi() will liberate the devino and thus the MSIQ
-	 * allocation.
-	 */
-	sun4v_destroy_msi(virt_irq);
-}
+static const struct sparc64_msiq_ops pci_sun4v_msiq_ops = {	/* sun4v callbacks handed to sparc64_pbm_msi_init() */
+	.get_head	=	pci_sun4v_get_head,	/* read queue head offset, range-checked */
+	.dequeue_msi	=	pci_sun4v_dequeue_msi,	/* take one MSI entry at head, advance head */
+	.set_head	=	pci_sun4v_set_head,	/* write queue head offset back */
+	.msi_setup	=	pci_sun4v_msi_setup,	/* setmsiq, then IDLE, then VALID */
+	.msi_teardown	=	pci_sun4v_msi_teardown,	/* mark the MSI invalid */
+	.msiq_alloc	=	pci_sun4v_msiq_alloc,	/* queue setup; undone by msiq_free */
+	.msiq_free	=	pci_sun4v_msiq_free,	/* re-conf queues with 0UL/0, free pages */
+	.msiq_build_irq	=	pci_sun4v_msiq_build_irq,	/* virt IRQ for a queue devino; queue IDLE + VALID */
+};
 
 static void pci_sun4v_msi_init(struct pci_pbm_info *pbm)
 {
-	const u32 *val;
-	int len;
-
-	val = of_get_property(pbm->prom_node, "#msi-eqs", &len);
-	if (!val || len != 4)
-		goto no_msi;
-	pbm->msiq_num = *val;
-	if (pbm->msiq_num) {
-		const struct msiq_prop {
-			u32 first_msiq;
-			u32 num_msiq;
-			u32 first_devino;
-		} *mqp;
-		const struct msi_range_prop {
-			u32 first_msi;
-			u32 num_msi;
-		} *mrng;
-		const struct addr_range_prop {
-			u32 msi32_high;
-			u32 msi32_low;
-			u32 msi32_len;
-			u32 msi64_high;
-			u32 msi64_low;
-			u32 msi64_len;
-		} *arng;
-
-		val = of_get_property(pbm->prom_node, "msi-eq-size", &len);
-		if (!val || len != 4)
-			goto no_msi;
-
-		pbm->msiq_ent_count = *val;
-
-		mqp = of_get_property(pbm->prom_node,
-				      "msi-eq-to-devino", &len);
-		if (!mqp || len != sizeof(struct msiq_prop))
-			goto no_msi;
-
-		pbm->msiq_first = mqp->first_msiq;
-		pbm->msiq_first_devino = mqp->first_devino;
-
-		val = of_get_property(pbm->prom_node, "#msi", &len);
-		if (!val || len != 4)
-			goto no_msi;
-		pbm->msi_num = *val;
-
-		mrng = of_get_property(pbm->prom_node, "msi-ranges", &len);
-		if (!mrng || len != sizeof(struct msi_range_prop))
-			goto no_msi;
-		pbm->msi_first = mrng->first_msi;
-
-		val = of_get_property(pbm->prom_node, "msi-data-mask", &len);
-		if (!val || len != 4)
-			goto no_msi;
-		pbm->msi_data_mask = *val;
-
-		val = of_get_property(pbm->prom_node, "msix-data-width", &len);
-		if (!val || len != 4)
-			goto no_msi;
-		pbm->msix_data_width = *val;
-
-		arng = of_get_property(pbm->prom_node, "msi-address-ranges",
-				       &len);
-		if (!arng || len != sizeof(struct addr_range_prop))
-			goto no_msi;
-		pbm->msi32_start = ((u64)arng->msi32_high << 32) |
-			(u64) arng->msi32_low;
-		pbm->msi64_start = ((u64)arng->msi64_high << 32) |
-			(u64) arng->msi64_low;
-		pbm->msi32_len = arng->msi32_len;
-		pbm->msi64_len = arng->msi64_len;
-
-		if (msi_bitmap_alloc(pbm))
-			goto no_msi;
-
-		if (msi_queue_alloc(pbm)) {
-			msi_bitmap_free(pbm);
-			goto no_msi;
-		}
-
-		printk(KERN_INFO "%s: MSI Queue first[%u] num[%u] count[%u] "
-		       "devino[0x%x]\n",
-		       pbm->name,
-		       pbm->msiq_first, pbm->msiq_num,
-		       pbm->msiq_ent_count,
-		       pbm->msiq_first_devino);
-		printk(KERN_INFO "%s: MSI first[%u] num[%u] mask[0x%x] "
-		       "width[%u]\n",
-		       pbm->name,
-		       pbm->msi_first, pbm->msi_num, pbm->msi_data_mask,
-		       pbm->msix_data_width);
-		printk(KERN_INFO "%s: MSI addr32[0x%lx:0x%x] "
-		       "addr64[0x%lx:0x%x]\n",
-		       pbm->name,
-		       pbm->msi32_start, pbm->msi32_len,
-		       pbm->msi64_start, pbm->msi64_len);
-		printk(KERN_INFO "%s: MSI queues at RA [%p]\n",
-		       pbm->name,
-		       pbm->msi_queues);
-	}
-	pbm->setup_msi_irq = pci_sun4v_setup_msi_irq;
-	pbm->teardown_msi_irq = pci_sun4v_teardown_msi_irq;
-
-	return;
-
-no_msi:
-	pbm->msiq_num = 0;
-	printk(KERN_INFO "%s: No MSI support.\n", pbm->name);
+	sparc64_pbm_msi_init(pbm, &pci_sun4v_msiq_ops);
 }
 #else /* CONFIG_PCI_MSI */
 static void pci_sun4v_msi_init(struct pci_pbm_info *pbm)
diff --git a/include/asm-sparc64/irq.h b/include/asm-sparc64/irq.h
index 4de3725..bad3c28 100644
--- a/include/asm-sparc64/irq.h
+++ b/include/asm-sparc64/irq.h
@@ -59,8 +59,10 @@
 extern void sun4u_destroy_msi(unsigned int virt_irq);
 extern unsigned int sbus_build_irq(void *sbus, unsigned int ino);
 
-extern void sparc64_set_msi(unsigned int virt_irq, u32 msi);
-extern u32 sparc64_get_msi(unsigned int virt_irq);
+extern unsigned char virt_irq_alloc(unsigned int real_irq);
+#ifdef CONFIG_PCI_MSI
+extern void virt_irq_free(unsigned int virt_irq);
+#endif
 
 extern void fixup_irqs(void);