drm/nouveau: port all engines to new engine module format

This is a HUGE commit, but it's not nearly as bad as it looks - any problems
can be isolated to a particular chipset and engine combination.  It was
simply too difficult to port each one at a time, the compat layers are
*already* ridiculous.

Most of the changes here are simply to the glue, the process for each of the
engine modules was to start with a standard skeleton and copy+paste the old
code into the appropriate places, fixing up variable names etc as needed.

v2: Marcin Slusarz <marcin.slusarz@gmail.com>
- fix find/replace bug in license header

v3: Ben Skeggs <bskeggs@redhat.com>
- bump indirect pushbuf size to 8KiB, 4KiB barely enough for userspace and
  left no space for kernel's requirements during GEM pushbuf submission.
- fix duplicate assignments noticed by clang

v4: Marcin Slusarz <marcin.slusarz@gmail.com>
- add sparse annotations to nv04_fifo_pause/nv04_fifo_start
- use ioread32_native/iowrite32_native for fifo control registers

v5: Ben Skeggs <bskeggs@redhat.com>
- rebase on v3.6-rc4, modified to keep copy engine fix intact
- nv10/fence: unmap fence bo before destroying
- fixed fermi regression when using nvidia gr fuc
- fixed typo in supported dma_mask checking

Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.c b/drivers/gpu/drm/nouveau/nouveau_dma.c
index efd0823..40f91e1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dma.c
@@ -24,41 +24,16 @@
  *
  */
 
-#include "drmP.h"
-#include "drm.h"
-#include "nouveau_drv.h"
+#include <core/client.h>
+
+#include "nouveau_drm.h"
 #include "nouveau_dma.h"
-#include <core/ramht.h>
-
-void
-nouveau_dma_init(struct nouveau_channel *chan)
-{
-	struct drm_nouveau_private *dev_priv = chan->dev->dev_private;
-	struct nouveau_bo *pushbuf = chan->pushbuf_bo;
-
-	if (dev_priv->card_type >= NV_50) {
-		const int ib_size = pushbuf->bo.mem.size / 2;
-
-		chan->dma.ib_base = (pushbuf->bo.mem.size - ib_size) >> 2;
-		chan->dma.ib_max = (ib_size / 8) - 1;
-		chan->dma.ib_put = 0;
-		chan->dma.ib_free = chan->dma.ib_max - chan->dma.ib_put;
-
-		chan->dma.max = (pushbuf->bo.mem.size - ib_size) >> 2;
-	} else {
-		chan->dma.max  = (pushbuf->bo.mem.size >> 2) - 2;
-	}
-
-	chan->dma.put  = 0;
-	chan->dma.cur  = chan->dma.put;
-	chan->dma.free = chan->dma.max - chan->dma.cur;
-}
 
 void
 OUT_RINGp(struct nouveau_channel *chan, const void *data, unsigned nr_dwords)
 {
 	bool is_iomem;
-	u32 *mem = ttm_kmap_obj_virtual(&chan->pushbuf_bo->kmap, &is_iomem);
+	u32 *mem = ttm_kmap_obj_virtual(&chan->push.buffer->kmap, &is_iomem);
 	mem = &mem[chan->dma.cur];
 	if (is_iomem)
 		memcpy_toio((void __force __iomem *)mem, data, nr_dwords * 4);
@@ -79,9 +54,9 @@
 {
 	uint64_t val;
 
-	val = nvchan_rd32(chan, chan->user_get);
+	val = nv_ro32(chan->object, chan->user_get);
         if (chan->user_get_hi)
-                val |= (uint64_t)nvchan_rd32(chan, chan->user_get_hi) << 32;
+                val |= (uint64_t)nv_ro32(chan->object, chan->user_get_hi) << 32;
 
 	/* reset counter as long as GET is still advancing, this is
 	 * to avoid misdetecting a GPU lockup if the GPU happens to
@@ -93,32 +68,33 @@
 	}
 
 	if ((++*timeout & 0xff) == 0) {
-		DRM_UDELAY(1);
+		udelay(1);
 		if (*timeout > 100000)
 			return -EBUSY;
 	}
 
-	if (val < chan->pushbuf_base ||
-	    val > chan->pushbuf_base + (chan->dma.max << 2))
+	if (val < chan->push.vma.offset ||
+	    val > chan->push.vma.offset + (chan->dma.max << 2))
 		return -EINVAL;
 
-	return (val - chan->pushbuf_base) >> 2;
+	return (val - chan->push.vma.offset) >> 2;
 }
 
 void
 nv50_dma_push(struct nouveau_channel *chan, struct nouveau_bo *bo,
 	      int delta, int length)
 {
-	struct nouveau_bo *pb = chan->pushbuf_bo;
+	struct nouveau_bo *pb = chan->push.buffer;
 	struct nouveau_vma *vma;
 	int ip = (chan->dma.ib_put * 2) + chan->dma.ib_base;
 	u64 offset;
 
-	vma = nouveau_bo_vma_find(bo, chan->vm);
+	vma = nouveau_bo_vma_find(bo, nv_client(chan->cli)->vm);
 	BUG_ON(!vma);
 	offset = vma->offset + delta;
 
 	BUG_ON(chan->dma.ib_free < 1);
+
 	nouveau_bo_wr32(pb, ip++, lower_32_bits(offset));
 	nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8);
 
@@ -128,7 +104,7 @@
 	/* Flush writes. */
 	nouveau_bo_rd32(pb, 0);
 
-	nvchan_wr32(chan, 0x8c, chan->dma.ib_put);
+	nv_wo32(chan->object, 0x8c, chan->dma.ib_put);
 	chan->dma.ib_free--;
 }
 
@@ -138,7 +114,7 @@
 	uint32_t cnt = 0, prev_get = 0;
 
 	while (chan->dma.ib_free < count) {
-		uint32_t get = nvchan_rd32(chan, 0x88);
+		uint32_t get = nv_ro32(chan->object, 0x88);
 		if (get != prev_get) {
 			prev_get = get;
 			cnt = 0;
@@ -249,7 +225,7 @@
 			 * instruct the GPU to jump back to the start right
 			 * after processing the currently pending commands.
 			 */
-			OUT_RING(chan, chan->pushbuf_base | 0x20000000);
+			OUT_RING(chan, chan->push.vma.offset | 0x20000000);
 
 			/* wait for GET to depart from the skips area.
 			 * prevents writing GET==PUT and causing a race