Merge tag 'vfio-for-linus' of git://github.com/awilliam/linux-vfio

Pull vfio fixes from Alex Williamson:
 "VFIO doc update and virqfd race fix"

* tag 'vfio-for-linus' of git://github.com/awilliam/linux-vfio:
  vfio: Fix virqfd release race
  vfio: Trivial Documentation correction
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 0cb6685..8eda363 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -133,7 +133,7 @@
 $ lspci -n -s 0000:06:0d.0
 06:0d.0 0401: 1102:0002 (rev 08)
 # echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
-# echo 1102 0002 > /sys/bus/pci/drivers/vfio/new_id
+# echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id
 
 Now we need to look at what other devices are in the group to free
 it for use by VFIO:
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 211a492..d8dedc7 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -76,9 +76,24 @@
 			schedule_work(&virqfd->inject);
 	}
 
-	if (flags & POLLHUP)
-		/* The eventfd is closing, detach from VFIO */
-		virqfd_deactivate(virqfd);
+	if (flags & POLLHUP) {
+		unsigned long flags;
+		spin_lock_irqsave(&virqfd->vdev->irqlock, flags);
+
+		/*
+		 * The eventfd is closing; if the virqfd has not yet been
+		 * queued for release, as determined by testing whether the
+		 * vdev pointer to it is still valid, queue it now.  As
+		 * with kvm irqfds, we know we won't race against the virqfd
+		 * going away because we hold wqh->lock to get here.
+		 */
+		if (*(virqfd->pvirqfd) == virqfd) {
+			*(virqfd->pvirqfd) = NULL;
+			virqfd_deactivate(virqfd);
+		}
+
+		spin_unlock_irqrestore(&virqfd->vdev->irqlock, flags);
+	}
 
 	return 0;
 }
@@ -93,7 +108,6 @@
 static void virqfd_shutdown(struct work_struct *work)
 {
 	struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
-	struct virqfd **pvirqfd = virqfd->pvirqfd;
 	u64 cnt;
 
 	eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
@@ -101,7 +115,6 @@
 	eventfd_ctx_put(virqfd->eventfd);
 
 	kfree(virqfd);
-	*pvirqfd = NULL;
 }
 
 static void virqfd_inject(struct work_struct *work)
@@ -122,15 +135,11 @@
 	int ret = 0;
 	unsigned int events;
 
-	if (*pvirqfd)
-		return -EBUSY;
-
 	virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
 	if (!virqfd)
 		return -ENOMEM;
 
 	virqfd->pvirqfd = pvirqfd;
-	*pvirqfd = virqfd;
 	virqfd->vdev = vdev;
 	virqfd->handler = handler;
 	virqfd->thread = thread;
@@ -154,6 +163,23 @@
 	virqfd->eventfd = ctx;
 
 	/*
+	 * virqfds can be released by closing the eventfd or directly
+	 * through ioctl.  These are both done through a workqueue, so
+	 * we update the pointer to the virqfd under lock to avoid
+	 * pushing multiple jobs to release the same virqfd.
+	 */
+	spin_lock_irq(&vdev->irqlock);
+
+	if (*pvirqfd) {
+		spin_unlock_irq(&vdev->irqlock);
+		ret = -EBUSY;
+		goto fail;
+	}
+	*pvirqfd = virqfd;
+
+	spin_unlock_irq(&vdev->irqlock);
+
+	/*
 	 * Install our own custom wake-up handling so we are notified via
 	 * a callback whenever someone signals the underlying eventfd.
 	 */
@@ -187,19 +213,29 @@
 		fput(file);
 
 	kfree(virqfd);
-	*pvirqfd = NULL;
 
 	return ret;
 }
 
-static void virqfd_disable(struct virqfd *virqfd)
+static void virqfd_disable(struct vfio_pci_device *vdev,
+			   struct virqfd **pvirqfd)
 {
-	if (!virqfd)
-		return;
+	unsigned long flags;
 
-	virqfd_deactivate(virqfd);
+	spin_lock_irqsave(&vdev->irqlock, flags);
 
-	/* Block until we know all outstanding shutdown jobs have completed. */
+	if (*pvirqfd) {
+		virqfd_deactivate(*pvirqfd);
+		*pvirqfd = NULL;
+	}
+
+	spin_unlock_irqrestore(&vdev->irqlock, flags);
+
+	/*
+	 * Block until we know all outstanding shutdown jobs have completed.
+	 * Even if we don't queue the job, flush the wq to be sure it's
+	 * been released.
+	 */
 	flush_workqueue(vfio_irqfd_cleanup_wq);
 }
 
@@ -392,8 +428,8 @@
 static void vfio_intx_disable(struct vfio_pci_device *vdev)
 {
 	vfio_intx_set_signal(vdev, -1);
-	virqfd_disable(vdev->ctx[0].unmask);
-	virqfd_disable(vdev->ctx[0].mask);
+	virqfd_disable(vdev, &vdev->ctx[0].unmask);
+	virqfd_disable(vdev, &vdev->ctx[0].mask);
 	vdev->irq_type = VFIO_PCI_NUM_IRQS;
 	vdev->num_ctx = 0;
 	kfree(vdev->ctx);
@@ -539,8 +575,8 @@
 	vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
 
 	for (i = 0; i < vdev->num_ctx; i++) {
-		virqfd_disable(vdev->ctx[i].unmask);
-		virqfd_disable(vdev->ctx[i].mask);
+		virqfd_disable(vdev, &vdev->ctx[i].unmask);
+		virqfd_disable(vdev, &vdev->ctx[i].mask);
 	}
 
 	if (msix) {
@@ -577,7 +613,7 @@
 					     vfio_send_intx_eventfd, NULL,
 					     &vdev->ctx[0].unmask, fd);
 
-		virqfd_disable(vdev->ctx[0].unmask);
+		virqfd_disable(vdev, &vdev->ctx[0].unmask);
 	}
 
 	return 0;