[PATCH] Using TCP_NODELAY unconditionally
Tobias Ringstrom
tori at ringstrom.mine.nu
Mon Jan 21 10:12:57 EST 2002
On Sun, 20 Jan 2002, Kevin Steves wrote:
> On Sun, 20 Jan 2002, Tobias Ringstrom wrote:
> :Please let me know if you still want a tcpdump! I cannot produce one
> :right now because one of my computers is busy, and I need to sleep.
>
> yes, i would like to see analysis before we change anything. also, among
> other things, see: http://citeseer.nj.nec.com/minshall99application.html
Alright - I only hope you have the stamina to examine my data now that
I've worked so hard to produce it... :-)
The test is to transfer the file gcc-3.0.2.tar.gz from boris (PII-233,
sftp server) to igor (Athlon-1000, sftp client), both running Linux
2.4.17. The file is cached at the server and writted to /dev/null on the
client. The network is an otherwise idle 100 Mbit full duplex switched
ethernet. Both machines are mostly idle. I tried very hard not to mix
patches/tests by compiling all versions, and installing them in separate
directories. I also inspected the installed config files, and run strace
(c.f truss) to be sure that the right binaries was used. I've run four
tests:
1. sftp
No TCP_NODELAY and no overlapped requests (unpatched 3.0.2p1)
2. sftp.nodelay
TCP_NODELAY but no overlapped requests
3. sftp.overlap
No TCP_NODELAY but overlapped requests
4. TCP_NODELAY and overlapped requests
For the overlapped tests, the overlap is 1, i.e. two active read requests
at any time. The following information is collected:
1. ifstat (my own utility, running on the Athlon (sftp client)
KiBi - kilobytes in per second
KiBo - kilobytes out per second
pi - packets in per second
po - packets out per second
2. vmstat on both boris and igor (one second interval)
3. A one second snippet of tcpdump data from both sides (steady state)
(full binary tcpdumps available if needed)
In this test, the network is not the limiting factor, and it is a very low
latency link. The sftp server boris is the bottle-neck.
As can be seen, either overlapping requests _or_ TCP_NODELAY gives good
performance in this test.
When running on the local interface on the Athlon, TCP_NODELAY gives much
better performance than one overlapping request. Increasing the number of
overlapping requests to something large (10) achieves almost the same
performance as with TCP_NODELAY. With TCP_NODELAY, overlapping requests
does not make a noticable difference.
I did most of my previous testing on the local interface which led me to
believe that the nodelay patch needed to go in first, but from this test
run, I conclude that the overlap patch works very well on its own, and can
be applied at any time. I'm guessing that it will take some time to reach
a conclusion regarding the Nagle issue...
I've also attached the patches I used for anyone to play with (or apply to
the CVS tree... :-)
/Tobias
-------------- next part --------------
diff -ru openssh-3.0.2p1.orig/sftp-client.c openssh-3.0.2p1.sftp/sftp-client.c
--- openssh-3.0.2p1.orig/sftp-client.c Wed Jul 18 17:45:45 2001
+++ openssh-3.0.2p1.sftp/sftp-client.c Sun Jan 6 23:13:44 2002
@@ -45,6 +45,15 @@
/* How much data to read/write at at time during copies */
/* XXX: what should this be? */
#define COPY_SIZE 8192
+/* Maximum number of overlapping requests */
+#define OVERLAPPING_REQUESTS 1
+
+/* A read/write request */
+struct request {
+ u_int id;
+ u_int len;
+ u_int64_t offset;
+};
/* Message ID */
static u_int msg_id = 1;
@@ -215,6 +224,44 @@
return(a);
}
+static int
+find_request(struct request *rq, int num, u_int id)
+{
+ int i;
+
+ for (i = 0; i < num; ++i) {
+ if (rq[i].id == id)
+ break;
+ }
+
+ if (i == num)
+ fatal("Request ID mismatch (%d)", id);
+
+ return i;
+}
+
+static void
+remove_request(struct request *rq, int *num, int i)
+{
+ memmove(rq + i, rq + i + 1, (*num - i - 1) * sizeof(struct request));
+ --*num;
+}
+
+static void
+send_request(int fd, const char *handle, u_int handle_len, int type,
+ const struct request *rq, Buffer *m)
+{
+ buffer_clear(m);
+ buffer_put_char(m, SSH2_FXP_READ);
+ buffer_put_int(m, rq->id);
+ buffer_put_string(m, handle, handle_len);
+ buffer_put_int64(m, rq->offset);
+ buffer_put_int(m, rq->len);
+ send_msg(fd, m);
+ debug3("Sent message SSH2_FXP_READ I:%d O:%llu S:%u",
+ rq->id, rq->offset, rq->len);
+}
+
int
do_init(int fd_in, int fd_out)
{
@@ -674,12 +721,15 @@
int pflag)
{
int local_fd;
- u_int expected_id, handle_len, mode, type, id;
+ u_int handle_len, mode, type, id;
u_int64_t offset;
char *handle;
Buffer msg;
Attrib junk, *a;
int status;
+ struct request req[OVERLAPPING_REQUESTS + 1];
+ int num_req = 0, max_req = 1, reply;
+ int write_error = 0, read_error = 0, write_errno;
a = do_stat(fd_in, fd_out, remote_path, 0);
if (a == NULL)
@@ -726,87 +776,103 @@
/* Read from remote and write to local */
offset = 0;
- for(;;) {
- u_int len;
+ while (num_req > 0 || max_req > 0) {
char *data;
+ u_int len;
- id = expected_id = msg_id++;
-
- buffer_clear(&msg);
- buffer_put_char(&msg, SSH2_FXP_READ);
- buffer_put_int(&msg, id);
- buffer_put_string(&msg, handle, handle_len);
- buffer_put_int64(&msg, offset);
- buffer_put_int(&msg, COPY_SIZE);
- send_msg(fd_out, &msg);
- debug3("Sent message SSH2_FXP_READ I:%d O:%llu S:%u",
- id, (u_int64_t)offset, COPY_SIZE);
+ /* Send some more requests */
+ while (num_req < max_req) {
+ req[num_req].id = msg_id++;
+ req[num_req].len = COPY_SIZE;
+ req[num_req].offset = offset;
+ offset += req[num_req].len;
+ num_req++;
+ send_request(fd_out, handle, handle_len, SSH2_FXP_READ,
+ &req[num_req - 1], &msg);
+ }
buffer_clear(&msg);
-
get_msg(fd_in, &msg);
type = buffer_get_char(&msg);
id = buffer_get_int(&msg);
debug3("Received reply T:%d I:%d", type, id);
- if (id != expected_id)
- fatal("ID mismatch (%d != %d)", id, expected_id);
- if (type == SSH2_FXP_STATUS) {
+ reply = find_request(req, num_req, id);
+
+ switch (type) {
+ case SSH2_FXP_STATUS:
status = buffer_get_int(&msg);
+ if (status != SSH2_FX_EOF)
+ read_error = 1;
+ max_req = 0;
+ remove_request(req, &num_req, reply);
+ break;
+ case SSH2_FXP_DATA:
+ data = buffer_get_string(&msg, &len);
+ if (len > req[reply].len)
+ fatal("Received more data than asked for "
+ "%d > %d", len, req[reply].len);
+
+ debug3("In read loop, got %d offset %llu",
+ len, req[reply].offset);
+ if ((lseek(local_fd, req[reply].offset, SEEK_SET) == -1 ||
+ atomicio(write, local_fd, data, len) != len) &&
+ !write_error) {
+ write_errno = errno;
+ write_error = 1;
+ max_req = 0;
+ }
+ xfree(data);
- if (status == SSH2_FX_EOF)
- break;
+ if (len == req[reply].len)
+ remove_request(req, &num_req, reply);
else {
- error("Couldn't read from remote "
- "file \"%s\" : %s", remote_path,
- fx2txt(status));
- do_close(fd_in, fd_out, handle, handle_len);
- goto done;
+ /* Resend the request for the missing data */
+ req[reply].id = msg_id++;
+ req[reply].len -= len;
+ req[reply].offset += len;
+ send_request(fd_out, handle, handle_len,
+ SSH2_FXP_READ, &req[reply], &msg);
}
- } else if (type != SSH2_FXP_DATA) {
+ if (max_req > 0 && max_req < OVERLAPPING_REQUESTS + 1)
+ ++max_req;
+ break;
+ default:
fatal("Expected SSH2_FXP_DATA(%d) packet, got %d",
- SSH2_FXP_DATA, type);
- }
-
- data = buffer_get_string(&msg, &len);
- if (len > COPY_SIZE)
- fatal("Received more data than asked for %d > %d",
- len, COPY_SIZE);
-
- debug3("In read loop, got %d offset %llu", len,
- (u_int64_t)offset);
- if (atomicio(write, local_fd, data, len) != len) {
- error("Couldn't write to \"%s\": %s", local_path,
- strerror(errno));
- do_close(fd_in, fd_out, handle, handle_len);
- status = -1;
- xfree(data);
- goto done;
+ SSH2_FXP_DATA, type);
}
-
- offset += len;
- xfree(data);
}
- status = do_close(fd_in, fd_out, handle, handle_len);
- /* Override umask and utimes if asked */
+ if (read_error) {
+ error("Couldn't read from remote "
+ "file \"%s\" : %s", remote_path,
+ fx2txt(status));
+ do_close(fd_in, fd_out, handle, handle_len);
+ } else if (write_error) {
+ error("Couldn't write to \"%s\": %s", local_path,
+ strerror(write_errno));
+ status = -1;
+ do_close(fd_in, fd_out, handle, handle_len);
+ } else {
+ status = do_close(fd_in, fd_out, handle, handle_len);
+
+ /* Override umask and utimes if asked */
#ifdef HAVE_FCHMOD
- if (pflag && fchmod(local_fd, mode) == -1)
+ if (pflag && fchmod(local_fd, mode) == -1)
#else
- if (pflag && chmod(local_path, mode) == -1)
+ if (pflag && chmod(local_path, mode) == -1)
#endif /* HAVE_FCHMOD */
- error("Couldn't set mode on \"%s\": %s", local_path,
- strerror(errno));
- if (pflag && (a->flags & SSH2_FILEXFER_ATTR_ACMODTIME)) {
- struct timeval tv[2];
- tv[0].tv_sec = a->atime;
- tv[1].tv_sec = a->mtime;
- tv[0].tv_usec = tv[1].tv_usec = 0;
- if (utimes(local_path, tv) == -1)
- error("Can't set times on \"%s\": %s", local_path,
- strerror(errno));
+ error("Couldn't set mode on \"%s\": %s", local_path,
+ strerror(errno));
+ if (pflag && (a->flags & SSH2_FILEXFER_ATTR_ACMODTIME)) {
+ struct timeval tv[2];
+ tv[0].tv_sec = a->atime;
+ tv[1].tv_sec = a->mtime;
+ tv[0].tv_usec = tv[1].tv_usec = 0;
+ if (utimes(local_path, tv) == -1)
+ error("Can't set times on \"%s\": %s",
+ local_path, strerror(errno));
+ }
}
-
-done:
close(local_fd);
buffer_free(&msg);
xfree(handle);
@@ -825,6 +891,8 @@
struct stat sb;
Attrib a;
int status;
+ u_int32_t startid;
+ u_int32_t ackid;
if ((local_fd = open(local_path, O_RDONLY, 0)) == -1) {
error("Couldn't open local file \"%s\" for reading: %s",
@@ -866,6 +934,8 @@
return(-1);
}
+ startid = ackid = id + 1;
+
/* Read from local and write to remote */
offset = 0;
for(;;) {
@@ -883,29 +953,34 @@
if (len == -1)
fatal("Couldn't read from \"%s\": %s", local_path,
strerror(errno));
- if (len == 0)
- break;
- buffer_clear(&msg);
- buffer_put_char(&msg, SSH2_FXP_WRITE);
- buffer_put_int(&msg, ++id);
- buffer_put_string(&msg, handle, handle_len);
- buffer_put_int64(&msg, offset);
- buffer_put_string(&msg, data, len);
- send_msg(fd_out, &msg);
- debug3("Sent message SSH2_FXP_WRITE I:%d O:%llu S:%u",
- id, (u_int64_t)offset, len);
+ if (len != 0) {
+ buffer_clear(&msg);
+ buffer_put_char(&msg, SSH2_FXP_WRITE);
+ buffer_put_int(&msg, ++id);
+ buffer_put_string(&msg, handle, handle_len);
+ buffer_put_int64(&msg, offset);
+ buffer_put_string(&msg, data, len);
+ send_msg(fd_out, &msg);
+ debug3("Sent message SSH2_FXP_WRITE I:%d O:%llu S:%u",
+ id, (u_int64_t)offset, len);
+ } else if ( id < ackid )
+ break;
- status = get_status(fd_in, id);
- if (status != SSH2_FX_OK) {
- error("Couldn't write to remote file \"%s\": %s",
- remote_path, fx2txt(status));
- do_close(fd_in, fd_out, handle, handle_len);
- close(local_fd);
- goto done;
+ if (id == startid || len == 0 ||
+ id - ackid >= OVERLAPPING_REQUESTS) {
+ status = get_status(fd_in, ackid);
+ if (status != SSH2_FX_OK) {
+ error("Couldn't write to remote file \"%s\": %s",
+ remote_path, fx2txt(status));
+ do_close(fd_in, fd_out, handle, handle_len);
+ close(local_fd);
+ goto done;
+ }
+ debug3("In write loop, got %d offset %llu", len,
+ (u_int64_t)offset);
+ ++ackid;
}
- debug3("In write loop, got %d offset %llu", len,
- (u_int64_t)offset);
offset += len;
}
-------------- next part --------------
diff -ru openssh-3.0.2p1.orig/packet.c openssh-3.0.2p1.nodelay/packet.c
--- openssh-3.0.2p1.orig/packet.c Mon Nov 12 01:07:58 2001
+++ openssh-3.0.2p1.nodelay/packet.c Thu Jan 17 20:40:47 2002
@@ -1180,7 +1180,6 @@
int lowdelay = IPTOS_LOWDELAY;
int throughput = IPTOS_THROUGHPUT;
#endif
- int on = 1;
if (called)
return;
@@ -1198,7 +1197,7 @@
if (interactive) {
/*
* Set IP options for an interactive connection. Use
- * IPTOS_LOWDELAY and TCP_NODELAY.
+ * IPTOS_LOWDELAY.
*/
#if defined(IP_TOS) && !defined(IP_TOS_IS_BROKEN)
if (packet_connection_is_ipv4()) {
@@ -1208,9 +1207,6 @@
strerror(errno));
}
#endif
- if (setsockopt(connection_in, IPPROTO_TCP, TCP_NODELAY, (void *) &on,
- sizeof(on)) < 0)
- error("setsockopt TCP_NODELAY: %.100s", strerror(errno));
} else if (packet_connection_is_ipv4()) {
/*
* Set IP options for a non-interactive connection. Use
diff -ru openssh-3.0.2p1.orig/sshconnect.c openssh-3.0.2p1.nodelay/sshconnect.c
--- openssh-3.0.2p1.orig/sshconnect.c Wed Oct 10 07:07:45 2001
+++ openssh-3.0.2p1.nodelay/sshconnect.c Thu Jan 17 20:09:11 2002
@@ -370,6 +370,7 @@
linger.l_onoff = 1;
linger.l_linger = 5;
setsockopt(sock, SOL_SOCKET, SO_LINGER, (void *)&linger, sizeof(linger));
+ setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (void *) &on, sizeof(on));
/* Set keepalives if requested. */
if (options.keepalives &&
diff -ru openssh-3.0.2p1.orig/sshd.c openssh-3.0.2p1.nodelay/sshd.c
--- openssh-3.0.2p1.orig/sshd.c Mon Nov 12 01:07:12 2001
+++ openssh-3.0.2p1.nodelay/sshd.c Thu Jan 17 20:42:24 2002
@@ -1118,6 +1118,7 @@
linger.l_onoff = 1;
linger.l_linger = 5;
setsockopt(sock_in, SOL_SOCKET, SO_LINGER, (void *) &linger, sizeof(linger));
+ setsockopt(sock_in, IPPROTO_TCP, TCP_NODELAY, (void *) &on, sizeof(on));
/* Set keepalives if requested. */
if (options.keepalives &&
-------------- next part --------------
A non-text attachment was scrubbed...
Name: sftp-bench.tar.gz
Type: application/x-gzip
Size: 11954 bytes
Desc:
Url : http://lists.mindrot.org/pipermail/openssh-unix-dev/attachments/20020121/da23d17c/attachment.bin
More information about the openssh-unix-dev
mailing list