RDS: make m_rdma_op a member of rds_message

This eliminates a separate memory alloc, although
it is now necessary to add an "r_active" flag, since
it is no longer to use the m_rdma_op pointer as an
indicator of if an rdma op is present.

rdma SGs allocated from rm sg pool.

rds_rm_size also gets bigger. It's a little inefficient to
run through CMSGs twice, but it makes later steps a lot smoother.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 575fce4..f0edfdb 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -85,8 +85,8 @@
 			rm->data.m_sg, rm->data.m_nents,
 			DMA_TO_DEVICE);
 
-	if (rm->rdma.m_rdma_op) {
-		rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
+	if (rm->rdma.m_rdma_op.r_active) {
+		rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
 
 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
@@ -110,10 +110,10 @@
 		 */
 		rds_ib_send_rdma_complete(rm, wc_status);
 
-		if (rm->rdma.m_rdma_op->r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+		if (rm->rdma.m_rdma_op.r_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
 	}
 
 	/* If anyone waited for this message to get flushed out, wake
@@ -243,8 +243,8 @@
 
 				rm = rds_send_get_message(conn, send->s_op);
 				if (rm) {
-					if (rm->rdma.m_rdma_op)
-						rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
+					if (rm->rdma.m_rdma_op.r_active)
+						rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
 					rds_ib_send_rdma_complete(rm, wc.status);
 					rds_message_put(rm);
 				}
@@ -560,10 +560,10 @@
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->rdma.m_rdma_op) {
+		if (rm->rdma.m_rdma_op.r_active) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -601,7 +601,7 @@
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
+	if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
 		send_flags = IB_SEND_FENCE;
 
 	/*
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 62234b8..9b79a1b 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -85,8 +85,8 @@
 		     rm->data.m_sg, rm->data.m_nents,
 		     DMA_TO_DEVICE);
 
-	if (rm->rdma.m_rdma_op) {
-		rds_iw_send_unmap_rdma(ic, rm->rdma.m_rdma_op);
+	if (rm->rdma.m_rdma_op.r_active) {
+		rds_iw_send_unmap_rdma(ic, &rm->rdma.m_rdma_op);
 
 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
@@ -110,10 +110,10 @@
 		 */
 		rds_iw_send_rdma_complete(rm, wc_status);
 
-		if (rm->rdma.m_rdma_op->r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+		if (rm->rdma.m_rdma_op.r_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes);
 	}
 
 	/* If anyone waited for this message to get flushed out, wake
@@ -591,10 +591,10 @@
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->rdma.m_rdma_op) {
+		if (rm->rdma.m_rdma_op.r_active) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -632,7 +632,7 @@
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence)
+	if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence)
 		send_flags = IB_SEND_FENCE;
 
 	/*
diff --git a/net/rds/message.c b/net/rds/message.c
index fb382fb..4352ce7 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -69,8 +69,8 @@
 	}
 	rm->data.m_nents = 0;
 
-	if (rm->rdma.m_rdma_op)
-		rds_rdma_free_op(rm->rdma.m_rdma_op);
+	if (rm->rdma.m_rdma_op.r_active)
+		rds_rdma_free_op(&rm->rdma.m_rdma_op);
 	if (rm->rdma.m_rdma_mr)
 		rds_mr_put(rm->rdma.m_rdma_mr);
 }
@@ -259,14 +259,17 @@
 {
 	struct rds_message *rm;
 	unsigned int i;
+	int num_sgs = ceil(total_len, PAGE_SIZE);
+	int extra_bytes = num_sgs * sizeof(struct scatterlist);
 
-	rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+	rm = rds_message_alloc(extra_bytes, GFP_KERNEL);
 	if (!rm)
 		return ERR_PTR(-ENOMEM);
 
 	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
 	rm->data.m_nents = ceil(total_len, PAGE_SIZE);
+	rm->data.m_sg = rds_message_alloc_sgs(rm, num_sgs);
 
 	for (i = 0; i < rm->data.m_nents; ++i) {
 		sg_set_page(&rm->data.m_sg[i],
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index a21edad..7ff3379 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -458,26 +458,60 @@
 	}
 
 	kfree(ro->r_notifier);
-	kfree(ro);
+	ro->r_notifier = NULL;
+	ro->r_active = 0;
+}
+
+/*
+ * Count the number of pages needed to describe an incoming iovec.
+ */
+static int rds_rdma_pages(struct rds_rdma_args *args)
+{
+	struct rds_iovec vec;
+	struct rds_iovec __user *local_vec;
+	unsigned int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < args->nr_local; i++) {
+		if (copy_from_user(&vec, &local_vec[i],
+				   sizeof(struct rds_iovec)))
+			return -EFAULT;
+
+		nr_pages = rds_pages_in_vec(&vec);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+	}
+
+	return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
+{
+	return rds_rdma_pages(args) * sizeof(struct scatterlist);
 }
 
 /*
  * args is a pointer to an in-kernel copy in the sendmsg cmsg.
  */
-static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
-					    struct rds_rdma_args *args)
+static int rds_rdma_prepare(struct rds_message *rm,
+			    struct rds_sock *rs,
+			    struct rds_rdma_args *args)
 {
 	struct rds_iovec vec;
-	struct rds_rdma_op *op = NULL;
+	struct rds_rdma_op *op = &rm->rdma.m_rdma_op;
 	unsigned int nr_pages;
-	unsigned int max_pages;
 	unsigned int nr_bytes;
 	struct page **pages = NULL;
 	struct rds_iovec __user *local_vec;
-	struct scatterlist *sg;
 	unsigned int nr;
 	unsigned int i, j;
-	int ret;
+	int ret = 0;
 
 
 	if (rs->rs_bound_addr == 0) {
@@ -490,44 +524,21 @@
 		goto out;
 	}
 
-	nr_pages = 0;
-	max_pages = 0;
+	nr_pages = rds_rdma_pages(args);
+	if (nr_pages < 0)
+		goto out;
 
-	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
-	/* figure out the number of pages in the vector */
-	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				   sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		max_pages = max(nr, max_pages);
-		nr_pages += nr;
-	}
-
-	pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
 	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
-	if (!op) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
+	op->r_sg = rds_message_alloc_sgs(rm, nr_pages);
 	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
 	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
 	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->r_active = 1;
 	op->r_recverr = rs->rs_recverr;
 	WARN_ON(!nr_pages);
 	sg_init_table(op->r_sg, nr_pages);
@@ -564,6 +575,8 @@
 	       (unsigned long long)args->remote_vec.addr,
 	       op->r_key);
 
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
 	for (i = 0; i < args->nr_local; i++) {
 		if (copy_from_user(&vec, &local_vec[i],
 				   sizeof(struct rds_iovec))) {
@@ -580,11 +593,6 @@
 		rs->rs_user_addr = vec.addr;
 		rs->rs_user_bytes = vec.bytes;
 
-		/* did the user change the vec under us? */
-		if (nr > max_pages || op->r_nents + nr > nr_pages) {
-			ret = -EINVAL;
-			goto out;
-		}
 		/* If it's a WRITE operation, we want to pin the pages for reading.
 		 * If it's a READ operation, we need to pin the pages for writing.
 		 */
@@ -599,6 +607,7 @@
 
 		for (j = 0; j < nr; j++) {
 			unsigned int offset = vec.addr & ~PAGE_MASK;
+			struct scatterlist *sg;
 
 			sg = &op->r_sg[op->r_nents + j];
 			sg_set_page(sg, pages[j],
@@ -628,12 +637,10 @@
 	ret = 0;
 out:
 	kfree(pages);
-	if (ret) {
-		if (op)
-			rds_rdma_free_op(op);
-		op = ERR_PTR(ret);
-	}
-	return op;
+	if (ret)
+		rds_rdma_free_op(op);
+
+	return ret;
 }
 
 /*
@@ -643,17 +650,17 @@
 int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 			  struct cmsghdr *cmsg)
 {
-	struct rds_rdma_op *op;
+	int ret;
 
 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
-	    rm->rdma.m_rdma_op)
+	    rm->rdma.m_rdma_op.r_active)
 		return -EINVAL;
 
-	op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
-	if (IS_ERR(op))
-		return PTR_ERR(op);
+	ret = rds_rdma_prepare(rm, rs, CMSG_DATA(cmsg));
+	if (ret)
+		return ret;
+
 	rds_stats_inc(s_send_rdma);
-	rm->rdma.m_rdma_op = op;
 	return 0;
 }
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 7c4adbe..0bb4957 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -316,7 +316,7 @@
 	rds_rdma_cookie_t	m_rdma_cookie;
 	struct {
 		struct {
-			struct rds_rdma_op	*m_rdma_op;
+			struct rds_rdma_op	m_rdma_op;
 			struct rds_mr		*m_rdma_mr;
 		} rdma;
 		struct {
diff --git a/net/rds/send.c b/net/rds/send.c
index 89e26ff..72dbe7f 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -235,7 +235,7 @@
 			 * connection.
 			 * Therefore, we never retransmit messages with RDMA ops.
 			 */
-			if (rm->rdma.m_rdma_op &&
+			if (rm->rdma.m_rdma_op.r_active &&
 			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
 				spin_lock_irqsave(&conn->c_lock, flags);
 				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
@@ -267,8 +267,8 @@
 		 * keep this simple and require that the transport either
 		 * send the whole rdma or none of it.
 		 */
-		if (rm->rdma.m_rdma_op && !conn->c_xmit_rdma_sent) {
-			ret = conn->c_trans->xmit_rdma(conn, rm->rdma.m_rdma_op);
+		if (rm->rdma.m_rdma_op.r_active && !conn->c_xmit_rdma_sent) {
+			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma.m_rdma_op);
 			if (ret)
 				break;
 			conn->c_xmit_rdma_sent = 1;
@@ -418,9 +418,9 @@
 
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-	ro = rm->rdma.m_rdma_op;
+	ro = &rm->rdma.m_rdma_op;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-	    ro && ro->r_notify && ro->r_notifier) {
+	    ro->r_active && ro->r_notify && ro->r_notifier) {
 		notifier = ro->r_notifier;
 		rs = rm->m_rs;
 		sock_hold(rds_rs_to_sk(rs));
@@ -452,8 +452,8 @@
 {
 	struct rds_rdma_op *ro;
 
-	ro = rm->rdma.m_rdma_op;
-	if (ro && ro->r_notify && ro->r_notifier) {
+	ro = &rm->rdma.m_rdma_op;
+	if (ro->r_active && ro->r_notify && ro->r_notifier) {
 		ro->r_notifier->n_status = status;
 		list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
 		ro->r_notifier = NULL;
@@ -476,7 +476,7 @@
 	spin_lock_irqsave(&conn->c_lock, flags);
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-		if (rm->rdma.m_rdma_op == op) {
+		if (&rm->rdma.m_rdma_op == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			goto out;
@@ -484,7 +484,7 @@
 	}
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-		if (rm->rdma.m_rdma_op == op) {
+		if (&rm->rdma.m_rdma_op == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			break;
@@ -544,19 +544,20 @@
 		spin_lock(&rs->rs_lock);
 
 		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
-			struct rds_rdma_op *ro = rm->rdma.m_rdma_op;
+			struct rds_rdma_op *ro = &rm->rdma.m_rdma_op;
 			struct rds_notifier *notifier;
 
 			list_del_init(&rm->m_sock_item);
 			rds_send_sndbuf_remove(rs, rm);
 
-			if (ro && ro->r_notifier && (status || ro->r_notify)) {
+			if (ro->r_active && ro->r_notifier &&
+			    (status || ro->r_notify)) {
 				notifier = ro->r_notifier;
 				list_add_tail(&notifier->n_list,
 						&rs->rs_notify_queue);
 				if (!notifier->n_status)
 					notifier->n_status = status;
-				rm->rdma.m_rdma_op->r_notifier = NULL;
+				rm->rdma.m_rdma_op.r_notifier = NULL;
 			}
 			was_on_sock = 1;
 			rm->m_rs = NULL;
@@ -763,9 +764,37 @@
  */
 static int rds_rm_size(struct msghdr *msg, int data_len)
 {
+	struct cmsghdr *cmsg;
 	int size = 0;
+	int retval;
 
-	size +=	ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_ARGS:
+			retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+			if (retval < 0)
+				return retval;
+			size += retval;
+			break;
+
+		case RDS_CMSG_RDMA_DEST:
+		case RDS_CMSG_RDMA_MAP:
+			/* these are valid but do no add any size */
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
+	}
+
+	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
 
 	return size;
 }
@@ -896,11 +925,11 @@
 	if (ret)
 		goto out;
 
-	if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op) &&
+	if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op.r_active) &&
 	    !conn->c_trans->xmit_rdma) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-				rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
+				&rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma);
 		ret = -EOPNOTSUPP;
 		goto out;
 	}