net: Add sendmmsg socket system call
This patch adds a multiple message send syscall and is the send
version of the existing recvmmsg syscall. This is heavily
based on the patch by Arnaldo that added recvmmsg.
I wrote a microbenchmark to test the performance gains of using
this new syscall:
http://ozlabs.org/~anton/junkcode/sendmmsg_test.c
The test was run on a ppc64 box with a 10 Gbit network card. The
benchmark can send both UDP and RAW ethernet packets.
64B UDP
batch pkts/sec
1 804570
2 872800 (+ 8 %)
4 916556 (+14 %)
8 939712 (+17 %)
16 952688 (+18 %)
32 956448 (+19 %)
64 964800 (+20 %)
64B raw socket
batch pkts/sec
1 1201449
2 1350028 (+12 %)
4 1461416 (+22 %)
8 1513080 (+26 %)
16 1541216 (+28 %)
32 1553440 (+29 %)
64 1557888 (+30 %)
We see a 20% improvement in throughput on UDP send and 30%
on raw socket send.
[ Add sparc syscall entries. -DaveM ]
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 60f64b1..8489d37 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -352,3 +352,4 @@
COMPAT_SYS_SPU(open_by_handle_at)
COMPAT_SYS_SPU(clock_adjtime)
SYSCALL_SPU(syncfs)
+COMPAT_SYS_SPU(sendmmsg)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 3c21564..6d23c81 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -371,10 +371,11 @@
#define __NR_open_by_handle_at 346
#define __NR_clock_adjtime 347
#define __NR_syncfs 348
+#define __NR_sendmmsg 349
#ifdef __KERNEL__
-#define __NR_syscalls 349
+#define __NR_syscalls 350
#define __NR__exit __NR_exit
#define NR_syscalls __NR_syscalls
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h
index 9d897b6..c5387ed 100644
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -404,8 +404,9 @@
#define __NR_open_by_handle_at 333
#define __NR_clock_adjtime 334
#define __NR_syncfs 335
+#define __NR_sendmmsg 336
-#define NR_syscalls 336
+#define NR_syscalls 337
#ifdef __32bit_syscall_numbers__
/* Sparc 32-bit only has the "setresuid32", "getresuid32" variants,
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index 47ac73c..332c83f 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -84,4 +84,4 @@
/*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg, sys_fanotify_init
/*330*/ .long sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime
-/*335*/ .long sys_syncfs
+/*335*/ .long sys_syncfs, sys_sendmmsg
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 4f3170c..43887ca 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -85,7 +85,7 @@
/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv
.word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open, compat_sys_recvmmsg, sys_fanotify_init
/*330*/ .word sys32_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, compat_sys_open_by_handle_at, compat_sys_clock_adjtime
- .word sys_syncfs
+ .word sys_syncfs, compat_sys_sendmmsg
#endif /* CONFIG_COMPAT */
@@ -162,4 +162,4 @@
/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
.word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg, sys_fanotify_init
/*330*/ .word sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime
- .word sys_syncfs
+ .word sys_syncfs, sys_sendmmsg
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 849a9d2..95f5826 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -848,4 +848,5 @@
.quad compat_sys_open_by_handle_at
.quad compat_sys_clock_adjtime
.quad sys_syncfs
+ .quad compat_sys_sendmmsg /* 345 */
ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index a755ef5..fb6a625 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -350,10 +350,11 @@
#define __NR_open_by_handle_at 342
#define __NR_clock_adjtime 343
#define __NR_syncfs 344
+#define __NR_sendmmsg 345
#ifdef __KERNEL__
-#define NR_syscalls 345
+#define NR_syscalls 346
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 160fa76..79f90eb 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -677,6 +677,8 @@
__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
#define __NR_syncfs 306
__SYSCALL(__NR_syncfs, sys_syncfs)
+#define __NR_sendmmsg 307
+__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index abce34d..32cbffb 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -344,3 +344,4 @@
.long sys_open_by_handle_at
.long sys_clock_adjtime
.long sys_syncfs
+ .long sys_sendmmsg /* 345 */
diff --git a/include/linux/net.h b/include/linux/net.h
index 94de83c..1da55e9 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -42,6 +42,7 @@
#define SYS_RECVMSG 17 /* sys_recvmsg(2) */
#define SYS_ACCEPT4 18 /* sys_accept4(2) */
#define SYS_RECVMMSG 19 /* sys_recvmmsg(2) */
+#define SYS_SENDMMSG 20 /* sys_sendmmsg(2) */
typedef enum {
SS_FREE = 0, /* not allocated */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index d2b5e98..4ef98e4 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -333,5 +333,7 @@
extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
unsigned int flags, struct timespec *timeout);
+extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
+ unsigned int vlen, unsigned int flags);
#endif /* not kernel and not glibc */
#endif /* _LINUX_SOCKET_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 83ecc17..ab71447 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -610,6 +610,8 @@
asmlinkage long sys_sendto(int, void __user *, size_t, unsigned,
struct sockaddr __user *, int);
asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags);
+asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
+ unsigned int vlen, unsigned flags);
asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
struct sockaddr __user *, int __user *);
diff --git a/include/net/compat.h b/include/net/compat.h
index 28d5428..9ee75ed 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -43,6 +43,8 @@
extern int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *);
extern int verify_compat_iovec(struct msghdr *, struct iovec *, struct sockaddr *, int);
extern asmlinkage long compat_sys_sendmsg(int,struct compat_msghdr __user *,unsigned);
+extern asmlinkage long compat_sys_sendmmsg(int, struct compat_mmsghdr __user *,
+ unsigned, unsigned);
extern asmlinkage long compat_sys_recvmsg(int,struct compat_msghdr __user *,unsigned);
extern asmlinkage long compat_sys_recvmmsg(int, struct compat_mmsghdr __user *,
unsigned, unsigned,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 25cc41c..97e966f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,7 +46,9 @@
cond_syscall(compat_sys_getsockopt);
cond_syscall(sys_shutdown);
cond_syscall(sys_sendmsg);
+cond_syscall(sys_sendmmsg);
cond_syscall(compat_sys_sendmsg);
+cond_syscall(compat_sys_sendmmsg);
cond_syscall(sys_recvmsg);
cond_syscall(sys_recvmmsg);
cond_syscall(compat_sys_recvmsg);
diff --git a/net/compat.c b/net/compat.c
index 3649d58..c578d93 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -722,11 +722,11 @@
/* Argument list sizes for compat_sys_socketcall */
#define AL(x) ((x) * sizeof(u32))
-static unsigned char nas[20] = {
+static unsigned char nas[21] = {
AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
- AL(4), AL(5)
+ AL(4), AL(5), AL(4)
};
#undef AL
@@ -735,6 +735,13 @@
return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
}
+asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+ unsigned vlen, unsigned int flags)
+{
+ return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT);
+}
+
asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags)
{
return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
@@ -780,7 +787,7 @@
u32 a[6];
u32 a0, a1;
- if (call < SYS_SOCKET || call > SYS_RECVMMSG)
+ if (call < SYS_SOCKET || call > SYS_SENDMMSG)
return -EINVAL;
if (copy_from_user(a, args, nas[call]))
return -EFAULT;
@@ -839,6 +846,9 @@
case SYS_SENDMSG:
ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
break;
+ case SYS_SENDMMSG:
+ ret = compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]);
+ break;
case SYS_RECVMSG:
ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
break;
diff --git a/net/socket.c b/net/socket.c
index d25f5a9..ed50255 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -551,11 +551,10 @@
}
EXPORT_SYMBOL(sock_tx_timestamp);
-static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size)
+static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *msg, size_t size)
{
struct sock_iocb *si = kiocb_to_siocb(iocb);
- int err;
sock_update_classid(sock->sk);
@@ -564,13 +563,17 @@
si->msg = msg;
si->size = size;
- err = security_socket_sendmsg(sock, msg, size);
- if (err)
- return err;
-
return sock->ops->sendmsg(iocb, sock, msg, size);
}
+static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *msg, size_t size)
+{
+ int err = security_socket_sendmsg(sock, msg, size);
+
+ return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size);
+}
+
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct kiocb iocb;
@@ -586,6 +589,20 @@
}
EXPORT_SYMBOL(sock_sendmsg);
+int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct kiocb iocb;
+ struct sock_iocb siocb;
+ int ret;
+
+ init_sync_kiocb(&iocb, NULL);
+ iocb.private = &siocb;
+ ret = __sock_sendmsg_nosec(&iocb, sock, msg, size);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&iocb);
+ return ret;
+}
+
int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
struct kvec *vec, size_t num, size_t size)
{
@@ -1863,57 +1880,47 @@
#define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen)
#define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags)
-/*
- * BSD sendmsg interface
- */
-
-SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags)
+static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
+ struct msghdr *msg_sys, unsigned flags, int nosec)
{
struct compat_msghdr __user *msg_compat =
(struct compat_msghdr __user *)msg;
- struct socket *sock;
struct sockaddr_storage address;
struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
unsigned char ctl[sizeof(struct cmsghdr) + 20]
__attribute__ ((aligned(sizeof(__kernel_size_t))));
/* 20 is size of ipv6_pktinfo */
unsigned char *ctl_buf = ctl;
- struct msghdr msg_sys;
int err, ctl_len, iov_size, total_len;
- int fput_needed;
err = -EFAULT;
if (MSG_CMSG_COMPAT & flags) {
- if (get_compat_msghdr(&msg_sys, msg_compat))
+ if (get_compat_msghdr(msg_sys, msg_compat))
return -EFAULT;
- } else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
+ } else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
return -EFAULT;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- goto out;
-
/* do not move before msg_sys is valid */
err = -EMSGSIZE;
- if (msg_sys.msg_iovlen > UIO_MAXIOV)
- goto out_put;
+ if (msg_sys->msg_iovlen > UIO_MAXIOV)
+ goto out;
/* Check whether to allocate the iovec area */
err = -ENOMEM;
- iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
- if (msg_sys.msg_iovlen > UIO_FASTIOV) {
+ iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
+ if (msg_sys->msg_iovlen > UIO_FASTIOV) {
iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
if (!iov)
- goto out_put;
+ goto out;
}
/* This will also move the address data into kernel space */
if (MSG_CMSG_COMPAT & flags) {
- err = verify_compat_iovec(&msg_sys, iov,
+ err = verify_compat_iovec(msg_sys, iov,
(struct sockaddr *)&address,
VERIFY_READ);
} else
- err = verify_iovec(&msg_sys, iov,
+ err = verify_iovec(msg_sys, iov,
(struct sockaddr *)&address,
VERIFY_READ);
if (err < 0)
@@ -1922,17 +1929,17 @@
err = -ENOBUFS;
- if (msg_sys.msg_controllen > INT_MAX)
+ if (msg_sys->msg_controllen > INT_MAX)
goto out_freeiov;
- ctl_len = msg_sys.msg_controllen;
+ ctl_len = msg_sys->msg_controllen;
if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
err =
- cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl,
+ cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
sizeof(ctl));
if (err)
goto out_freeiov;
- ctl_buf = msg_sys.msg_control;
- ctl_len = msg_sys.msg_controllen;
+ ctl_buf = msg_sys->msg_control;
+ ctl_len = msg_sys->msg_controllen;
} else if (ctl_len) {
if (ctl_len > sizeof(ctl)) {
ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
@@ -1941,21 +1948,22 @@
}
err = -EFAULT;
/*
- * Careful! Before this, msg_sys.msg_control contains a user pointer.
+ * Careful! Before this, msg_sys->msg_control contains a user pointer.
* Afterwards, it will be a kernel pointer. Thus the compiler-assisted
* checking falls down on this.
*/
if (copy_from_user(ctl_buf,
- (void __user __force *)msg_sys.msg_control,
+ (void __user __force *)msg_sys->msg_control,
ctl_len))
goto out_freectl;
- msg_sys.msg_control = ctl_buf;
+ msg_sys->msg_control = ctl_buf;
}
- msg_sys.msg_flags = flags;
+ msg_sys->msg_flags = flags;
if (sock->file->f_flags & O_NONBLOCK)
- msg_sys.msg_flags |= MSG_DONTWAIT;
- err = sock_sendmsg(sock, &msg_sys, total_len);
+ msg_sys->msg_flags |= MSG_DONTWAIT;
+ err = (nosec ? sock_sendmsg_nosec : sock_sendmsg)(sock, msg_sys,
+ total_len);
out_freectl:
if (ctl_buf != ctl)
@@ -1963,12 +1971,114 @@
out_freeiov:
if (iov != iovstack)
sock_kfree_s(sock->sk, iov, iov_size);
-out_put:
+out:
+ return err;
+}
+
+/*
+ * BSD sendmsg interface
+ */
+
+SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags)
+{
+ int fput_needed, err;
+ struct msghdr msg_sys;
+ struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);
+
+ if (!sock)
+ goto out;
+
+ err = __sys_sendmsg(sock, msg, &msg_sys, flags, 0);
+
fput_light(sock->file, fput_needed);
out:
return err;
}
+/*
+ * Linux sendmmsg interface
+ */
+
+int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+ unsigned int flags)
+{
+ int fput_needed, err, datagrams;
+ struct socket *sock;
+ struct mmsghdr __user *entry;
+ struct compat_mmsghdr __user *compat_entry;
+ struct msghdr msg_sys;
+
+ datagrams = 0;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ return err;
+
+ err = sock_error(sock->sk);
+ if (err)
+ goto out_put;
+
+ entry = mmsg;
+ compat_entry = (struct compat_mmsghdr __user *)mmsg;
+
+ while (datagrams < vlen) {
+ /*
+ * No need to ask LSM for more than the first datagram.
+ */
+ if (MSG_CMSG_COMPAT & flags) {
+ err = __sys_sendmsg(sock, (struct msghdr __user *)compat_entry,
+ &msg_sys, flags, datagrams);
+ if (err < 0)
+ break;
+ err = __put_user(err, &compat_entry->msg_len);
+ ++compat_entry;
+ } else {
+ err = __sys_sendmsg(sock, (struct msghdr __user *)entry,
+ &msg_sys, flags, datagrams);
+ if (err < 0)
+ break;
+ err = put_user(err, &entry->msg_len);
+ ++entry;
+ }
+
+ if (err)
+ break;
+ ++datagrams;
+ }
+
+out_put:
+ fput_light(sock->file, fput_needed);
+
+ if (err == 0)
+ return datagrams;
+
+ if (datagrams != 0) {
+ /*
+ * We may send less entries than requested (vlen) if the
+ * sock is non blocking...
+ */
+ if (err != -EAGAIN) {
+ /*
+ * ... or if sendmsg returns an error after we
+ * send some datagrams, where we record the
+ * error to return on the next call or if the
+ * app asks about it using getsockopt(SO_ERROR).
+ */
+ sock->sk->sk_err = -err;
+ }
+
+ return datagrams;
+ }
+
+ return err;
+}
+
+SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags)
+{
+ return __sys_sendmmsg(fd, mmsg, vlen, flags);
+}
+
static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
struct msghdr *msg_sys, unsigned flags, int nosec)
{
@@ -2214,11 +2324,11 @@
#ifdef __ARCH_WANT_SYS_SOCKETCALL
/* Argument list sizes for sys_socketcall */
#define AL(x) ((x) * sizeof(unsigned long))
-static const unsigned char nargs[20] = {
+static const unsigned char nargs[21] = {
AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
- AL(4), AL(5)
+ AL(4), AL(5), AL(4)
};
#undef AL
@@ -2238,7 +2348,7 @@
int err;
unsigned int len;
- if (call < 1 || call > SYS_RECVMMSG)
+ if (call < 1 || call > SYS_SENDMMSG)
return -EINVAL;
len = nargs[call];
@@ -2313,6 +2423,9 @@
case SYS_SENDMSG:
err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
break;
+ case SYS_SENDMMSG:
+ err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
+ break;
case SYS_RECVMSG:
err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
break;