[PATCH] Using TCP_NODELAY unconditionally

Tobias Ringstrom tori at ringstrom.mine.nu
Mon Jan 21 10:12:57 EST 2002


On Sun, 20 Jan 2002, Kevin Steves wrote:

> On Sun, 20 Jan 2002, Tobias Ringstrom wrote:
> :Please let me know if you still want a tcpdump!  I cannot produce one
> :right now because one of my computers is busy, and I need to sleep.
> 
> yes, i would like to see analysis before we change anything.  also, among
> other things, see: http://citeseer.nj.nec.com/minshall99application.html

Alright - I only hope you have the stamina to examine my data now that
I've worked so hard to produce it...  :-)

The test is to transfer the file gcc-3.0.2.tar.gz from boris (PII-233,
sftp server) to igor (Athlon-1000, sftp client), both running Linux
2.4.17.  The file is cached at the server and writted to /dev/null on the
client.  The network is an otherwise idle 100 Mbit full duplex switched
ethernet.  Both machines are mostly idle.  I tried very hard not to mix
patches/tests by compiling all versions, and installing them in separate
directories.  I also inspected the installed config files, and run strace
(c.f truss) to be sure that the right binaries was used. I've run four
tests:

1. sftp
   No TCP_NODELAY and no overlapped requests  (unpatched 3.0.2p1)
2. sftp.nodelay
   TCP_NODELAY but no overlapped requests
3. sftp.overlap
   No TCP_NODELAY but overlapped requests
4. TCP_NODELAY and overlapped requests

For the overlapped tests, the overlap is 1, i.e. two active read requests 
at any time.  The following information is collected:

1. ifstat (my own utility, running on the Athlon (sftp client)
       KiBi - kilobytes in per second
       KiBo - kilobytes out per second
       pi - packets in per second
       po - packets out per second
2. vmstat on both boris and igor (one second interval)
3. A one second snippet of tcpdump data from both sides (steady state)
   (full binary tcpdumps available if needed)

In this test, the network is not the limiting factor, and it is a very low
latency link.  The sftp server boris is the bottle-neck.

As can be seen, either overlapping requests _or_ TCP_NODELAY gives good
performance in this test.

When running on the local interface on the Athlon, TCP_NODELAY gives much
better performance than one overlapping request.  Increasing the number of
overlapping requests to something large (10) achieves almost the same
performance as with TCP_NODELAY.  With TCP_NODELAY, overlapping requests 
does not make a noticable difference.

I did most of my previous testing on the local interface which led me to
believe that the nodelay patch needed to go in first, but from this test
run, I conclude that the overlap patch works very well on its own, and can
be applied at any time.  I'm guessing that it will take some time to reach
a conclusion regarding the Nagle issue...

I've also attached the patches I used for anyone to play with (or apply to 
the CVS tree... :-)

/Tobias
-------------- next part --------------
diff -ru openssh-3.0.2p1.orig/sftp-client.c openssh-3.0.2p1.sftp/sftp-client.c
--- openssh-3.0.2p1.orig/sftp-client.c	Wed Jul 18 17:45:45 2001
+++ openssh-3.0.2p1.sftp/sftp-client.c	Sun Jan  6 23:13:44 2002
@@ -45,6 +45,15 @@
 /* How much data to read/write at at time during copies */
 /* XXX: what should this be? */
 #define COPY_SIZE	8192
+/* Maximum number of overlapping requests */
+#define OVERLAPPING_REQUESTS 1
+
+/* A read/write request */
+struct request {
+	u_int id;
+	u_int len;
+	u_int64_t offset;
+};
 
 /* Message ID */
 static u_int msg_id = 1;
@@ -215,6 +224,44 @@
 	return(a);
 }
 
+static int
+find_request(struct request *rq, int num, u_int id)
+{
+	int i;
+
+	for (i = 0; i < num; ++i) {
+		if (rq[i].id == id)
+			break;
+	}
+
+	if (i == num)
+		fatal("Request ID mismatch (%d)", id);
+
+	return i;
+}
+
+static void
+remove_request(struct request *rq, int *num, int i)
+{
+	memmove(rq + i, rq + i + 1, (*num - i - 1) * sizeof(struct request));
+	--*num;
+}
+
+static void
+send_request(int fd, const char *handle, u_int handle_len, int type,
+	     const struct request *rq, Buffer *m)
+{
+    buffer_clear(m);
+    buffer_put_char(m, SSH2_FXP_READ);
+    buffer_put_int(m, rq->id);
+    buffer_put_string(m, handle, handle_len);
+    buffer_put_int64(m, rq->offset);
+    buffer_put_int(m, rq->len);
+    send_msg(fd, m);
+    debug3("Sent message SSH2_FXP_READ I:%d O:%llu S:%u",
+	   rq->id, rq->offset, rq->len);
+}
+
 int
 do_init(int fd_in, int fd_out)
 {
@@ -674,12 +721,15 @@
     int pflag)
 {
 	int local_fd;
-	u_int expected_id, handle_len, mode, type, id;
+	u_int handle_len, mode, type, id;
 	u_int64_t offset;
 	char *handle;
 	Buffer msg;
 	Attrib junk, *a;
 	int status;
+	struct request req[OVERLAPPING_REQUESTS + 1];
+	int num_req = 0, max_req = 1, reply;
+	int write_error = 0, read_error = 0, write_errno;
 
 	a = do_stat(fd_in, fd_out, remote_path, 0);
 	if (a == NULL)
@@ -726,87 +776,103 @@
 
 	/* Read from remote and write to local */
 	offset = 0;
-	for(;;) {
-		u_int len;
+	while (num_req > 0 || max_req > 0) {
 		char *data;
+		u_int len;
 
-		id = expected_id = msg_id++;
-
-		buffer_clear(&msg);
-		buffer_put_char(&msg, SSH2_FXP_READ);
-		buffer_put_int(&msg, id);
-		buffer_put_string(&msg, handle, handle_len);
-		buffer_put_int64(&msg, offset);
-		buffer_put_int(&msg, COPY_SIZE);
-		send_msg(fd_out, &msg);
-		debug3("Sent message SSH2_FXP_READ I:%d O:%llu S:%u",
-		    id, (u_int64_t)offset, COPY_SIZE);
+		/* Send some more requests */
+		while (num_req < max_req) {
+			req[num_req].id = msg_id++;
+			req[num_req].len = COPY_SIZE;
+			req[num_req].offset = offset;
+			offset += req[num_req].len;
+			num_req++;
+			send_request(fd_out, handle, handle_len, SSH2_FXP_READ,
+				     &req[num_req - 1], &msg);
+		}
 
 		buffer_clear(&msg);
-
 		get_msg(fd_in, &msg);
 		type = buffer_get_char(&msg);
 		id = buffer_get_int(&msg);
 		debug3("Received reply T:%d I:%d", type, id);
-		if (id != expected_id)
-			fatal("ID mismatch (%d != %d)", id, expected_id);
-		if (type == SSH2_FXP_STATUS) {
+		reply = find_request(req, num_req, id);
+
+		switch (type) {
+		case SSH2_FXP_STATUS:
 			status = buffer_get_int(&msg);
+			if (status != SSH2_FX_EOF)
+				read_error = 1;
+			max_req = 0;
+			remove_request(req, &num_req, reply);
+			break;
+		case SSH2_FXP_DATA:
+			data = buffer_get_string(&msg, &len);
+			if (len > req[reply].len)
+				fatal("Received more data than asked for "
+				      "%d > %d", len, req[reply].len);
+
+			debug3("In read loop, got %d offset %llu",
+			       len, req[reply].offset);
+			if ((lseek(local_fd, req[reply].offset, SEEK_SET) == -1 ||
+			     atomicio(write, local_fd, data, len) != len) &&
+			    !write_error) {
+				write_errno = errno;
+				write_error = 1;
+				max_req = 0;
+			}
+			xfree(data);
 
-			if (status == SSH2_FX_EOF)
-				break;
+			if (len == req[reply].len)
+				remove_request(req, &num_req, reply);
 			else {
-				error("Couldn't read from remote "
-				    "file \"%s\" : %s", remote_path,
-				     fx2txt(status));
-				do_close(fd_in, fd_out, handle, handle_len);
-				goto done;
+				/* Resend the request for the missing data */
+				req[reply].id = msg_id++;
+				req[reply].len -= len;
+				req[reply].offset += len;
+				send_request(fd_out, handle, handle_len,
+					     SSH2_FXP_READ, &req[reply], &msg);
 			}
-		} else if (type != SSH2_FXP_DATA) {
+			if (max_req > 0 && max_req < OVERLAPPING_REQUESTS + 1)
+				++max_req;
+			break;
+		default:
 			fatal("Expected SSH2_FXP_DATA(%d) packet, got %d",
-			    SSH2_FXP_DATA, type);
-		}
-
-		data = buffer_get_string(&msg, &len);
-		if (len > COPY_SIZE)
-			fatal("Received more data than asked for %d > %d",
-			    len, COPY_SIZE);
-
-		debug3("In read loop, got %d offset %llu", len,
-		    (u_int64_t)offset);
-		if (atomicio(write, local_fd, data, len) != len) {
-			error("Couldn't write to \"%s\": %s", local_path,
-			    strerror(errno));
-			do_close(fd_in, fd_out, handle, handle_len);
-			status = -1;
-			xfree(data);
-			goto done;
+			      SSH2_FXP_DATA, type);
 		}
-
-		offset += len;
-		xfree(data);
 	}
-	status = do_close(fd_in, fd_out, handle, handle_len);
 
-	/* Override umask and utimes if asked */
+	if (read_error) {
+	    error("Couldn't read from remote "
+		  "file \"%s\" : %s", remote_path,
+		  fx2txt(status));
+	    do_close(fd_in, fd_out, handle, handle_len);
+	} else if (write_error) {
+	    error("Couldn't write to \"%s\": %s", local_path,
+		  strerror(write_errno));
+	    status = -1;
+	    do_close(fd_in, fd_out, handle, handle_len);
+	} else {
+		status = do_close(fd_in, fd_out, handle, handle_len);
+
+		/* Override umask and utimes if asked */
 #ifdef HAVE_FCHMOD
-	if (pflag && fchmod(local_fd, mode) == -1)
+		if (pflag && fchmod(local_fd, mode) == -1)
 #else 
-	if (pflag && chmod(local_path, mode) == -1)
+		if (pflag && chmod(local_path, mode) == -1)
 #endif /* HAVE_FCHMOD */
-		error("Couldn't set mode on \"%s\": %s", local_path,
-		    strerror(errno));
-	if (pflag && (a->flags & SSH2_FILEXFER_ATTR_ACMODTIME)) {
-		struct timeval tv[2];
-		tv[0].tv_sec = a->atime;
-		tv[1].tv_sec = a->mtime;
-		tv[0].tv_usec = tv[1].tv_usec = 0;
-		if (utimes(local_path, tv) == -1)
-			error("Can't set times on \"%s\": %s", local_path,
-			    strerror(errno));
+			error("Couldn't set mode on \"%s\": %s", local_path,
+			      strerror(errno));
+		if (pflag && (a->flags & SSH2_FILEXFER_ATTR_ACMODTIME)) {
+			struct timeval tv[2];
+			tv[0].tv_sec = a->atime;
+			tv[1].tv_sec = a->mtime;
+			tv[0].tv_usec = tv[1].tv_usec = 0;
+			if (utimes(local_path, tv) == -1)
+				error("Can't set times on \"%s\": %s",
+				      local_path, strerror(errno));
+		}
 	}
-
-done:
 	close(local_fd);
 	buffer_free(&msg);
 	xfree(handle);
@@ -825,6 +891,8 @@
 	struct stat sb;
 	Attrib a;
 	int status;
+	u_int32_t startid;
+	u_int32_t ackid;
 
 	if ((local_fd = open(local_path, O_RDONLY, 0)) == -1) {
 		error("Couldn't open local file \"%s\" for reading: %s",
@@ -866,6 +934,8 @@
 		return(-1);
 	}
 
+	startid = ackid = id + 1;
+
 	/* Read from local and write to remote */
 	offset = 0;
 	for(;;) {
@@ -883,29 +953,34 @@
 		if (len == -1)
 			fatal("Couldn't read from \"%s\": %s", local_path,
 			    strerror(errno));
-		if (len == 0)
-			break;
 
-		buffer_clear(&msg);
-		buffer_put_char(&msg, SSH2_FXP_WRITE);
-		buffer_put_int(&msg, ++id);
-		buffer_put_string(&msg, handle, handle_len);
-		buffer_put_int64(&msg, offset);
-		buffer_put_string(&msg, data, len);
-		send_msg(fd_out, &msg);
-		debug3("Sent message SSH2_FXP_WRITE I:%d O:%llu S:%u",
-		    id, (u_int64_t)offset, len);
+		if (len != 0) {
+			buffer_clear(&msg);
+			buffer_put_char(&msg, SSH2_FXP_WRITE);
+			buffer_put_int(&msg, ++id);
+			buffer_put_string(&msg, handle, handle_len);
+			buffer_put_int64(&msg, offset);
+			buffer_put_string(&msg, data, len);
+			send_msg(fd_out, &msg);
+			debug3("Sent message SSH2_FXP_WRITE I:%d O:%llu S:%u",
+			       id, (u_int64_t)offset, len);
+		} else if ( id < ackid )
+			break;
 
-		status = get_status(fd_in, id);
-		if (status != SSH2_FX_OK) {
-			error("Couldn't write to remote file \"%s\": %s",
-			    remote_path, fx2txt(status));
-			do_close(fd_in, fd_out, handle, handle_len);
-			close(local_fd);
-			goto done;
+		if (id == startid || len == 0 ||
+		    id - ackid >= OVERLAPPING_REQUESTS) {
+			status = get_status(fd_in, ackid);
+			if (status != SSH2_FX_OK) {
+				error("Couldn't write to remote file \"%s\": %s",
+				      remote_path, fx2txt(status));
+				do_close(fd_in, fd_out, handle, handle_len);
+				close(local_fd);
+				goto done;
+			}
+			debug3("In write loop, got %d offset %llu", len,
+			       (u_int64_t)offset);
+			++ackid;
 		}
-		debug3("In write loop, got %d offset %llu", len,
-		    (u_int64_t)offset);
 
 		offset += len;
 	}
-------------- next part --------------
diff -ru openssh-3.0.2p1.orig/packet.c openssh-3.0.2p1.nodelay/packet.c
--- openssh-3.0.2p1.orig/packet.c	Mon Nov 12 01:07:58 2001
+++ openssh-3.0.2p1.nodelay/packet.c	Thu Jan 17 20:40:47 2002
@@ -1180,7 +1180,6 @@
 	int lowdelay = IPTOS_LOWDELAY;
 	int throughput = IPTOS_THROUGHPUT;
 #endif
-	int on = 1;
 
 	if (called)
 		return;
@@ -1198,7 +1197,7 @@
 	if (interactive) {
 		/*
 		 * Set IP options for an interactive connection.  Use
-		 * IPTOS_LOWDELAY and TCP_NODELAY.
+		 * IPTOS_LOWDELAY.
 		 */
 #if defined(IP_TOS) && !defined(IP_TOS_IS_BROKEN)
 		if (packet_connection_is_ipv4()) {
@@ -1208,9 +1207,6 @@
 				    strerror(errno));
 		}
 #endif
-		if (setsockopt(connection_in, IPPROTO_TCP, TCP_NODELAY, (void *) &on,
-		    sizeof(on)) < 0)
-			error("setsockopt TCP_NODELAY: %.100s", strerror(errno));
 	} else if (packet_connection_is_ipv4()) {
 		/*
 		 * Set IP options for a non-interactive connection.  Use
diff -ru openssh-3.0.2p1.orig/sshconnect.c openssh-3.0.2p1.nodelay/sshconnect.c
--- openssh-3.0.2p1.orig/sshconnect.c	Wed Oct 10 07:07:45 2001
+++ openssh-3.0.2p1.nodelay/sshconnect.c	Thu Jan 17 20:09:11 2002
@@ -370,6 +370,7 @@
 	linger.l_onoff = 1;
 	linger.l_linger = 5;
 	setsockopt(sock, SOL_SOCKET, SO_LINGER, (void *)&linger, sizeof(linger));
+	setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (void *) &on, sizeof(on));
 
 	/* Set keepalives if requested. */
 	if (options.keepalives &&
diff -ru openssh-3.0.2p1.orig/sshd.c openssh-3.0.2p1.nodelay/sshd.c
--- openssh-3.0.2p1.orig/sshd.c	Mon Nov 12 01:07:12 2001
+++ openssh-3.0.2p1.nodelay/sshd.c	Thu Jan 17 20:42:24 2002
@@ -1118,6 +1118,7 @@
 	linger.l_onoff = 1;
 	linger.l_linger = 5;
 	setsockopt(sock_in, SOL_SOCKET, SO_LINGER, (void *) &linger, sizeof(linger));
+	setsockopt(sock_in, IPPROTO_TCP, TCP_NODELAY, (void *) &on, sizeof(on));
 
 	/* Set keepalives if requested. */
 	if (options.keepalives &&
-------------- next part --------------
A non-text attachment was scrubbed...
Name: sftp-bench.tar.gz
Type: application/x-gzip
Size: 11954 bytes
Desc: 
Url : http://lists.mindrot.org/pipermail/openssh-unix-dev/attachments/20020121/da23d17c/attachment.bin 


More information about the openssh-unix-dev mailing list