Below is a rough first cut at splice(2), based on the
API from Kevin Fall and Sally Floyd's work:
Sally Floyd and Kevin Fall.
Promoting the use of end-to-end congestion control in the Internet.
ACM/IEEE Transactions on Networking, 7(4):458-473, August 1999
The patch uses kcont(9) to stitch together the data from two TCP PCBs.
I've used the patch with a trivial userspace program to forward a
wire-speed gigabit ttcp stream, with the ttcp sender running on
machine A, the ttcp receiver on machine C, and machine B in the middle
using splice(2) to TCP-terminate A's traffic and forward on a separate
connection to machine C;
   ttcp                              ttcp
  sender                           receiver
     A  -------->    B    -------->   C
                 in-kernel
                 TCP splice
Before any thought of committing, I'd want to rework the API, to split
the overloaded length/flags arguments into two separate arguments: an
int for flags, and a u_long or off_t for the size to copy (though
the length was, historically, for UDP/record traffic).
The syscall numbers are for illustration only.
While that's interesting to me, as a prototype testbed for (for
example) 10GbE NICs, my guess is that implementing sendfile() using a
similar kcont scheme would be more interesting to a wider audience.
I'm sure the usual suspects will quickly notice that the patch
supports TCP only. I'm also interested in reworking the
protocol-level APIs to allow a more protocol/PF-neutral interface for
layering upper-level protocols on top of in-kernel APIs, but below the
*userspace* socket API.
Comments welcome.
Index: kern/syscalls.master
RCS file: /,v
retrieving revision 1.145
diff -w -u -r1.145 syscalls.master
kern/syscalls.master25 Feb 2005 19:53:56 -00001.145
kern/syscalls.master1 Sep 2005 23:55:20 -0000
...... -751,3 +751,6 ......
const sigset_t *mask); }
374STD{ int sys_pollts(struct pollfd *fds, u_int nfds, \
const struct timespec *ts, const sigset_t *mask); }
+375STD{ int sys_splice(int fd1, int fd2); }
+376 STD{ int sys_sendfile(int fd, int s, off_t offset, size_t nbytes, \
+ void *hdtr, off_t *sbytes, int flags); }
/dev/null2005-09-07 17:06:10.000000000 -0700
sys_splice.c2005-06-06 13:18:03.000000000 -0700
...... -0,0 +1,378 ......
+
+/*
+ * Copyright 2005 Jonathan Stone.
+ * All rights reserved.
+ */
+#include <sys/syscallargs.h>
+#include <sys/kcont.h>
+
+/* Wait-message string handed to tsleep() while sys_splice() sleeps. */
+const char spliceio[] = "spliceio";
+
+/* Syscall entry point and the helpers that move data between sockets. */
+int	sys_splice(struct lwp *l, void *v, register_t *retval);
+int	splice1(struct socket *s1, struct socket *s2, struct proc *p);
+int	splice_movedata(struct socket *from, struct socket *to,
+	    struct proc *p);
+
+/* Socket upcall and the kcont(9) continuation it schedules. */
+void	splice_sowake_tramp(struct socket *, caddr_t, int);
+void	splice_sowake_continuation(void *, void *, int);
+
+typedef struct splice_state {
+
+/*
+ * Per-splice bookkeeping shared by the syscall and both socket
+ * upcalls.  sys_splice() allocates one of these, installs it as the
+ * so_upcallarg of both sockets, and sleeps on its address; the
+ * continuation calls wakeup() on the same address when neither
+ * direction can carry more data.
+ *
+ * splc_self: Address of this struct, for userspace syscall to
+ * wait upon for completion. But really for debugging.
+ */
+struct splice_state * splc_self;
+
+/*
+ * The two spliced sockets, as recorded by sys_splice().
+ * thought: make these an explicit array?
+ */
+struct socket *splc_so1, *splc_so2;
+
+struct proc * splc_proc;	/* process that called splice(2); passed down to splice1() */
+
+/* Error (if any) returned to userspace upon completion. */
+int splc_error;
+
+/* Spliced socket state: */
+
+
+/*
+ * kcont(s), one per socket: splice_sowake_tramp() picks the one that
+ * matches the socket its upcall fired on.
+ * again, perhaps make these an explicit array?
+ */
+
+struct kc so1_kc;
+struct kc so2_kc;
+
+} splice_state;
+
+
+
+/*
+ * Data is ready to read: move it from one socket's receive queue to
+ * the other socket's send queue.  If the send socket has insufficient
+ * space, silently give up: we will try again later when sowwakeup()
+ * is called on the sending socket.
+ *
+ * from: socket whose receive buffer is drained.
+ * to:   socket whose protocol is handed the data via PRU_SEND.
+ * p:    process passed through to the protocol's usrreq handler.
+ *
+ * Returns 0 when there is nothing to move (or no room to move it),
+ * otherwise the result of soreceive() or of the PRU_SEND request.
+ * If soreceive() fails or hands back no mbufs, both sockets are shut
+ * down in both directions.
+ */
+int
+splice_movedata(struct socket *from, struct socket *to,
+    struct proc *p)
+{
+	struct mbuf *top;
+	struct uio uio;
+	long space, avail;
+	int error, rcvflags;
+	int s;
+
+	/*
+	 * Compute min(send space, available data).
+	 * XXX: needs a rework for sockets with record boundaries
+	 * (SOCK_DGRAM, SOCK_SEQPACKET).
+	 */
+	space = sbspace(&to->so_snd);
+	avail = from->so_rcv.sb_cc;
+	if (avail < space)
+		space = avail;
+
+	if (space == 0)
+		return 0;
+
+	if (space < 0) {
+		/* Label each value with the queue it was actually read from. */
+		printf("splice: negative space %ld from-q %lu to-q %ld\n",
+		    space, (unsigned long)from->so_rcv.sb_cc,
+		    sbspace(&to->so_snd));
+		return 0;
+	}
+
+	/*
+	 * Read off the "from" socket.  No iovec is supplied: the data is
+	 * handed back as an mbuf chain in "top", and uio_resid only
+	 * bounds how much we ask for.
+	 */
+	top = NULL;
+	bzero(&uio, sizeof(uio));
+	uio.uio_resid = space;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_procp = NULL;	/* XXX: p */
+	uio.uio_iov = NULL;
+	uio.uio_iovcnt = 0;
+	rcvflags = MSG_DONTWAIT;
+
+	/*
+	 * XXX JRS: try instead
+	 * (*from->so_recv)(from, &nam, &uio, &top, (struct mbuf**)0, &flags)
+	 * as in sys/nfs/nfs_socket.c
+	 */
+	error = soreceive(from, /* &paddr */ NULL, &uio,
+	    &top, /*controlp*/ NULL, &rcvflags);
+#if defined(DEBUG)
+	printf("asked for %ld got resid %ld\n", space, (long)uio.uio_resid);
+#endif
+
+	if (error != 0 || top == NULL) {
+		printf("splice: after recv, error = %d, top = %p\n",
+		    error, top);
+		soshutdown(from, SHUT_RDWR);
+		soshutdown(to, SHUT_RDWR);
+		goto done;
+	}
+
+	/* Send it: hand the mbuf chain straight to the protocol. */
+#if 0
+	error = (*to->so_send)(to, (struct mbuf*)NULL, (struct uio *)NULL,
+	    top, NULL, 0, p);
+#else
+	s = splsoftnet();
+	error = (*to->so_proto->pr_usrreq)(to, PRU_SEND, top, NULL, NULL, p);
+	splx(s);
+#endif
+
+done:
+	return (error);
+}
+
+
+/*
+ * Make one pass over the splice in both directions: first s1 -> s2,
+ * then s2 -> s1.  A direction is attempted only when its source is
+ * readable and its destination is writable.  Returns the first
+ * non-zero result of splice_movedata(), or 0.
+ */
+int
+splice1(struct socket *s1, struct socket *s2, struct proc *p)
+{
+	int rv = 0;
+
+	/* Forward direction. */
+	if (soreadable((s1)) && sowritable(s2))
+		rv = splice_movedata(s1, s2, p);
+	if (rv != 0)
+		return rv;
+
+	/* Reverse direction. */
+	if (soreadable((s2)) && sowritable(s1))
+		rv = splice_movedata(s2, s1, p);
+
+	return rv;
+}
+
+
+/*
+ * Socket upcall installed by sys_splice() on both spliced sockets.
+ *
+ * This function is called from protocol-specific code at splsoftnet
+ * context, where it is unsafe to remove or add data to the socket's
+ * so_snd or so_rcv queues.  Schedule a kcont to run at a safer time.
+ *
+ * so:        socket the upcall fired on.
+ * upcallarg: the struct splice_state registered as so_upcallarg.
+ * mflags:    unused.
+ */
+void
+splice_sowake_tramp(struct socket *so, caddr_t upcallarg, int mflags)
+{
+	struct splice_state *splc = (struct splice_state *)upcallarg;
+	struct kc *kc;
+
+	/* Each socket has its own kcont slot in the shared state. */
+	if (so == splc->splc_so1)
+		kc = &splc->so1_kc;
+	else
+		kc = &splc->so2_kc;
+
+	/*
+	 * NOTE(review): the kc is re-initialised on every upcall; confirm
+	 * that a second upcall cannot arrive while it is still queued.
+	 */
+	kcont(kc, splice_sowake_continuation, splc, KC_IPL_DEFER_SOFTNET);
+	kcont_defer(kc, so, 0);
+}
+
+
+/*
+ * Called via kcont(9) from some suitable kcont level,
+ * after splice_sowake_tramp() has scheduled the kcont.
+ *
+ * When this function is called, we are no longer being called from
+ * protocol-specific code like (for example) tcp_input(); we can
+ * safely call soreceive() to fetch received data from one socket,
+ * then hand the received data to the other socket's protocol.
+ *
+ * obj:    the socket whose upcall fired (as given to kcont_defer()).
+ * kc_env: the struct splice_state shared by both sockets.
+ * status: unused.
+ */
+void
+splice_sowake_continuation(void *obj, void *kc_env, int status)
+{
+	struct socket *so = obj;
+	struct splice_state *knot = kc_env;
+	struct socket *so1, *so2;
+	int error;
+
+	/* Which direction were we called in?  so1 is the socket that woke us. */
+	if (so == knot->splc_so1) {
+		so1 = knot->splc_so1;
+		so2 = knot->splc_so2;
+	} else {
+		so1 = knot->splc_so2;
+		so2 = knot->splc_so1;
+	}
+
+	/* Remember the first failure so it is not silently dropped. */
+	error = splice1(so1, so2, knot->splc_proc);
+	if (error != 0 && knot->splc_error == 0)
+		knot->splc_error = error;
+
+	/*
+	 * XXX: check for clean close in both directions.
+	 * when we are done, wakeup the userspace caller sleeping
+	 * on the address of our struct splice_state.
+	 */
+#define CANSENDFROMTO(s1, s2)						\
+	((((s1)->so_state & SS_CANTRCVMORE) == 0) &&			\
+	 (((s2)->so_state & SS_CANTSENDMORE) == 0))
+
+	if (CANSENDFROMTO(so1, so2) || CANSENDFROMTO(so2, so1)) {
+		/* can send more; do not close down yet. */
+	} else {
+/*DBG*/		printf("splice state %p so %p %p: done\n", knot, so1, so2);
+		wakeup(knot);
+	}
+
+#undef CANSENDFROMTO
+}
+
+
+
+
+/*
+ * splice(2): join two connected stream sockets inside the kernel.
+ *
+ * Both descriptors must name distinct, connected SOCK_STREAM sockets.
+ * The call installs splice_sowake_tramp() as the upcall on both
+ * sockets, then sleeps until splice_sowake_continuation() reports that
+ * neither direction can carry more data.
+ *
+ * Returns 0 on success, or:
+ *   EBADF            a descriptor is invalid, or both name one socket
+ *   ENOTSOCK         a descriptor is not a socket
+ *   ESOCKTNOSUPPORT  socket types differ or are not SOCK_STREAM
+ *   ENOTCONN         a socket is not connected
+ *   ENOSPC           the splice state could not be allocated
+ * or the error from sblock(), or the first error recorded in the
+ * splice state while data was being moved.
+ */
+/* ARGSUSED */
+int
+sys_splice(struct lwp *l, void *v, register_t *retval)
+{
+	struct sys_splice_args /* {
+		syscallarg(int) fd1;
+		syscallarg(int) fd2;
+	} */ *uap = v;
+	struct filedesc *fdp;
+	struct file *fp1, *fp2;
+	struct socket *so1, *so2;
+	struct splice_state *knot;
+	struct proc *p;
+	int fd1, fd2;
+	int error;
+	int s;
+
+	p = l->l_proc;
+	fdp = p->p_fd;
+	error = 0;
+
+	fd1 = SCARG(uap, fd1);
+	if ((fp1 = fd_getfile(fdp, fd1)) == NULL)
+		return (EBADF);
+	FILE_USE(fp1);
+
+	fd2 = SCARG(uap, fd2);
+	if ((fp2 = fd_getfile(fdp, fd2)) == NULL) {
+		FILE_UNUSE(fp1, p);
+		return (EBADF);
+	}
+	FILE_USE(fp2);
+
+	if (fp1->f_type != DTYPE_SOCKET ||
+	    fp2->f_type != DTYPE_SOCKET) {
+		error = ENOTSOCK;	/* XXX EINVAL? */
+		goto done;
+	}
+
+	so1 = (struct socket *)fp1->f_data;
+	so2 = (struct socket *)fp2->f_data;
+
+	/*
+	 * Same underlying socket?
+	 * XXX: should handle splice-to-self as a special case, for
+	 * test/measurement purposes?  I think all we have to do
+	 * is not acquire the sblock twice (and not release twice).
+	 */
+	if (so1 == so2) {
+		error = EBADF;
+		goto done;
+	}
+
+	if (so1->so_type != so2->so_type) {
+		error = ESOCKTNOSUPPORT;
+		goto done;
+	}
+
+	/* XXX: implementation is really only SOCK_STREAM for now */
+	if (so1->so_type != SOCK_STREAM) {
+		error = ESOCKTNOSUPPORT;
+		goto done;
+	}
+
+	if ((so1->so_state & SS_ISCONNECTED) == 0 ||
+	    (so2->so_state & SS_ISCONNECTED) == 0) {
+		error = ENOTCONN;
+		goto done;
+	}
+
+	/*
+	 * Deadlock avoidance: ensure we obtain locks in a well-ordered
+	 * manner irrespective of the order in which callers supply
+	 * sockets.
+	 */
+	if (so2 < so1) {
+		struct socket *sotmp = so2;
+
+		so2 = so1;
+		so1 = sotmp;
+	}
+
+	/* Take both send-buffer locks; back out cleanly if either fails. */
+	error = sblock(&so1->so_snd, M_WAITOK);
+	if (error != 0)
+		goto done;
+	error = sblock(&so2->so_snd, M_WAITOK);
+	if (error != 0) {
+		sbunlock(&so1->so_snd);
+		goto done;
+	}
+
+	/*
+	 * XXX: set up kconts to shuffle data from one socket
+	 * to another, whenever space permits
+	 */
+	knot = malloc(sizeof(*knot), M_TEMP, M_WAITOK|M_ZERO);
+	if (knot == NULL) {
+		error = ENOSPC;
+		goto unlock;
+	}
+
+/*DBG*/	printf("splice socks %p %p state %p: setting up\n", so1, so2, knot);
+
+	knot->splc_self = knot;
+	knot->splc_error = 0;
+	knot->splc_so1 = so1;
+	knot->splc_so2 = so2;
+	knot->splc_proc = p;
+
+	/* wire up socket upcalls */
+	s = splnet();
+	so1->so_upcall = splice_sowake_tramp;
+	so1->so_upcallarg = (caddr_t)knot;
+	so1->so_rcv.sb_flags |= SB_UPCALL;
+	so1->so_snd.sb_flags |= SB_UPCALL;
+
+	so2->so_upcall = splice_sowake_tramp;
+	so2->so_upcallarg = (caddr_t)knot;
+	so2->so_rcv.sb_flags |= SB_UPCALL;
+	so2->so_snd.sb_flags |= SB_UPCALL;
+	splx(s);
+
+	/*
+	 * wait until both sockets are done
+	 * NOTE(review): a wakeup() issued between the splx() above and
+	 * this tsleep() would be missed; confirm whether that window
+	 * needs closing.
+	 */
+	(void)tsleep((caddr_t)knot, PSOCK, spliceio, 0);
+
+/*DBG*/	printf("splice done\n");
+
+	/* unwire upcalls */
+	s = splnet();
+	so1->so_snd.sb_flags &= ~SB_UPCALL;
+	so1->so_rcv.sb_flags &= ~SB_UPCALL;
+	so1->so_upcall = NULL;
+	so1->so_upcallarg = NULL;
+
+	so2->so_snd.sb_flags &= ~SB_UPCALL;
+	so2->so_rcv.sb_flags &= ~SB_UPCALL;
+	so2->so_upcall = NULL;
+	so2->so_upcallarg = NULL;
+	splx(s);
+
+	/* Report whatever was recorded while the splice was running. */
+	error = knot->splc_error;
+	free(knot, M_TEMP);
+
+unlock:
+	/* Release in the reverse of the order the locks were taken. */
+	sbunlock(&so2->so_snd);
+	sbunlock(&so1->so_snd);
+
+done:
+	FILE_UNUSE(fp1, p);
+	FILE_UNUSE(fp2, p);
+
+	return (error);
+}
+
+/*
+ * sendfile(2) placeholder: the syscall slot is wired up but there is
+ * no implementation yet, so every call fails with ENOSYS.
+ */
+int
+sys_sendfile(struct lwp *l, void *v, register_t *retval)
+{
+
+	return ENOSYS;
+}
Index: kern/init_sysent.c
RCS file: /,v
retrieving revision 1.163
diff -w -u -r1.163 init_sysent.c
kern/init_sysent.c27 Feb 2005 00:02:40 -00001.163
kern/init_sysent.c1 Sep 2005 23:55:20 -0000
...... -1,4 +1,4 ......
-/* $NetBSD: init_sysent.c,v 1.163 2005/02/27 00:02:40 perry Exp $ */
+/* $NetBSD$ */
/*
* System call switch table.
...... -8,7 +8,7 ......
*/
#include <sys/cdefs.h>
KERNEL_RCSID(0, "$NetBSD: init_sysent.c,v 1.163 2005/02/27 00:02:40 perry Exp $");
KERNEL_RCSID(0, "$NetBSD$");
#include "opt_ktrace.h"
#include "opt_nfsserver.h"
...... -994,10 +994,10 ......
sys_pselect },/* 373 = pselect */
{ 4, s(struct sys_pollts_args), 0,
sys_pollts },/* 374 = pollts */
-{ 0, 0, 0,
- sys_nosys },/* 375 = filler */
-{ 0, 0, 0,
- sys_nosys },/* 376 = filler */
+{ 2, s(struct sys_splice_args), 0,
+ sys_splice },/* 375 = splice */
+{ 7, s(struct sys_sendfile_args), 0,
+ sys_sendfile },/* 376 = sendfile */
{ 0, 0, 0,
sys_nosys },/* 377 = filler */
{ 0, 0, 0,
Index: kern/syscalls.c
RCS file: /,v
retrieving revision 1.158
diff -w -u -r1.158 syscalls.c
kern/syscalls.c27 Feb 2005 00:02:40 -00001.158
kern/syscalls.c1 Sep 2005 23:55:20 -0000
...... -1,4 +1,4 ......
-/* $NetBSD: syscalls.c,v 1.158 2005/02/27 00:02:40 perry Exp $ */
+/* $NetBSD$ */
/*
* System call names.
...... -8,7 +8,7 ......
*/
#include <sys/cdefs.h>
KERNEL_RCSID(0, "$NetBSD: syscalls.c,v 1.158 2005/02/27 00:02:40 perry Exp $");
KERNEL_RCSID(0, "$NetBSD$");
#if defined(_KERNELPT)
#include "opt_ktrace.h"
...... -510,4 +510,6 ......
"extattr_list_link",/* 372 = extattr_list_link */
"pselect",/* 373 = pselect */
"pollts",/* 374 = pollts */
+"splice",/* 375 = splice */
+"sendfile",/* 376 = sendfile */
};
Index: kern/uipc_syscalls.c
RCS file: /,v
retrieving revision 1.90
diff -w -u -r1.90 uipc_syscalls.c
kern/uipc_syscalls.c26 Feb 2005 21:34:55 -00001.90
kern/uipc_syscalls.c1 Sep 2005 23:55:22 -0000
...... -1116,3 +1116,5 ......
*fpp = fp;
return (0);
}
+
+#include "/"
Index: sys/syscall.h
RCS file: /cvsroot/src/sys/sys/syscall.h,v
retrieving revision 1.156
diff -w -u -r1.156 syscall.h
sys/syscall.h27 Feb 2005 00:03:25 -00001.156
sys/syscall.h1 Sep 2005 23:55:23 -0000
...... -1,4 +1,4 ......
-/* $NetBSD: syscall.h,v 1.156 2005/02/27 00:03:25 perry Exp $ */
+/* $NetBSD$ */
/*
* System call numbers.
...... -1025,5 +1025,11 ......
/* syscall: "pollts" ret: "int" args: "struct pollfd *" "u_int" "const struct timespec *" "const sigset_t *" */
#defineSYS_pollts374
-#defineSYS_MAXSYSCALL375
+/* syscall: "splice" ret: "int" args: "int" "int" */
+#defineSYS_splice375
+
+/* syscall: "sendfile" ret: "int" args: "int" "int" "off_t" "size_t" "void *" "off_t *" "int" */
+#defineSYS_sendfile376
+
+#defineSYS_MAXSYSCALL377
#defineSYS_NSYSENT512
Index: sys/syscallargs.h
RCS file: /,v
retrieving revision 1.138
diff -w -u -r1.138 syscallargs.h
sys/syscallargs.h27 Feb 2005 00:03:25 -00001.138
sys/syscallargs.h1 Sep 2005 23:55:23 -0000
...... -1,4 +1,4 ......
-/* $NetBSD: syscallargs.h,v 1.138 2005/02/27 00:03:25 perry Exp $ */
+/* $NetBSD$ */
/*
* System call argument lists.
...... -1588,6 +1588,21 ......
syscallarg(const sigset_t *) mask;
};
+struct sys_splice_args {
+syscallarg(int) fd1;
+syscallarg(int) fd2;
+};
+
+struct sys_sendfile_args {
+syscallarg(int) fd;
+syscallarg(int) s;
+syscallarg(off_t) offset;
+syscallarg(size_t) nbytes;
+syscallarg(void *) hdtr;
+syscallarg(off_t *) sbytes;
+syscallarg(int) flags;
+};
+
/*
* System call prototypes.
*/
...... -2251,4 +2266,8 ......
intsys_pollts(struct lwp *, void *, register_t *);
+intsys_splice(struct lwp *, void *, register_t *);
+
+intsys_sendfile(struct lwp *, void *, register_t *);
+
#endif /* _SYS__SYSCALLARGS_H_ */