[PATCH] Added NoDelay config option and nodelay subsystem option

Tobias Ringstrom tori at ringstrom.mine.nu
Tue Jan 29 19:29:37 EST 2002


On Mon, 28 Jan 2002, mouring wrote:

> [..]
> > My overlapping reads/writes for sftp seems to be ignored at the moment.
> > 
> > I'm not happy.
> > 
> > /Tobias
> > 
> One thing you must learn Tobias, unless things are clear cut as 'THE WAY'
> to go there is (and more than likely ever will be) a long period in time
> from patch conception to inclusion.  And this is one in which is not
> 100% clear cut.  Between NoDelay, COPY_SIZE, how many overlapping
> segments, and if we should let the user muck with them are all things that
> will affect how we accept the patch.

Of course I understand that it may take some time.  What I do not
understand is whether a long silence after the first flurry of comments to
a patch (or a suggested change) means that the patch is under
consideration, if it was deemed unacceptable, or if it was simply
forgotten.

One other thing is that since I did not know who the maintainers were
(there seem to be a whole bunch), I did not know what "suggestions" were
"masked vetos".

> I'm still waiting for you to provide any information on upping COPY_SIZE
> to 32k and retesting with multiple overlapping.  I don't remember seeing
> such an email messages.  

Looking through my mailboxes, I cannot find such a request, and I do not
remember one either.  The closest I can find is "Also, you may want to
check out pushing BLOCKSIZE (think that is right, not looking at the code)
to 4092 which is the max size the RFC states. This may change your
testing."

This comment was made invalid IMHO after the Nagle interaction problem was
found.

> However, this should also help lower the amount of overlapping request.  
> As a result lower the cost of a missing request. I don't want to get
> into the case where sftp thrashes due to multiple pending requests out
> and a bad link that drops 2 out of 5 packets.  Which would not only
> cause large amount of seeks() but also could trigger edge cases within
> the code for binary integerity.

First of all, we only need more than one overlapping request if the
delay-bandwidth product of the link is larger than the block size.  I do
not have any figures for a wide number of links.  The reason why I thought
I needed more was the interaction with the Nagle algorithm. The Nagle
problem is orthogonal, but needs to be resolved.  It seems to be very hard
to agree on a solution, though.

The latest version of the overlap patch (which is several weeks old now) 
contains code to reduce the request size if the server splits a request.  
With this mechanism, it should be safe to use a larger initial request 
size.

> I don't have the patch infront of me..but there was some bits I did not
> care for if I remember right.

Please reconsider the patch in light of the nodelay findings.  (And if you 
do benchmark it, please use TCP_NODELAY in both ends.)

/Tobias
-------------- next part --------------
diff -ru openssh-3.0.2p1.orig/sftp-client.c openssh-3.0.2p1.sftp/sftp-client.c
--- openssh-3.0.2p1.orig/sftp-client.c	Wed Jul 18 17:45:45 2001
+++ openssh-3.0.2p1.sftp/sftp-client.c	Sun Jan  6 23:13:44 2002
@@ -45,6 +45,15 @@
 /* How much data to read/write at at time during copies */
 /* XXX: what should this be? */
 #define COPY_SIZE	8192
+/* Maximum number of overlapping requests */
+#define OVERLAPPING_REQUESTS 1
+
+/* A read/write request */
+struct request {
+	u_int id;
+	u_int len;
+	u_int64_t offset;
+};
 
 /* Message ID */
 static u_int msg_id = 1;
@@ -215,6 +224,44 @@
 	return(a);
 }
 
+static int
+find_request(struct request *rq, int num, u_int id)
+{
+	int i;
+
+	for (i = 0; i < num; ++i) {
+		if (rq[i].id == id)
+			break;
+	}
+
+	if (i == num)
+		fatal("Request ID mismatch (%d)", id);
+
+	return i;
+}
+
+static void
+remove_request(struct request *rq, int *num, int i)
+{
+	memmove(rq + i, rq + i + 1, (*num - i - 1) * sizeof(struct request));
+	--*num;
+}
+
+static void
+send_request(int fd, const char *handle, u_int handle_len, int type,
+	     const struct request *rq, Buffer *m)
+{
+    buffer_clear(m);
+    buffer_put_char(m, SSH2_FXP_READ);
+    buffer_put_int(m, rq->id);
+    buffer_put_string(m, handle, handle_len);
+    buffer_put_int64(m, rq->offset);
+    buffer_put_int(m, rq->len);
+    send_msg(fd, m);
+    debug3("Sent message SSH2_FXP_READ I:%d O:%llu S:%u",
+	   rq->id, rq->offset, rq->len);
+}
+
 int
 do_init(int fd_in, int fd_out)
 {
@@ -674,12 +721,15 @@
     int pflag)
 {
 	int local_fd;
-	u_int expected_id, handle_len, mode, type, id;
+	u_int handle_len, mode, type, id;
 	u_int64_t offset;
 	char *handle;
 	Buffer msg;
 	Attrib junk, *a;
 	int status;
+	struct request req[OVERLAPPING_REQUESTS + 1];
+	int num_req = 0, max_req = 1, reply;
+	int write_error = 0, read_error = 0, write_errno;
 
 	a = do_stat(fd_in, fd_out, remote_path, 0);
 	if (a == NULL)
@@ -726,87 +776,103 @@
 
 	/* Read from remote and write to local */
 	offset = 0;
-	for(;;) {
-		u_int len;
+	while (num_req > 0 || max_req > 0) {
 		char *data;
+		u_int len;
 
-		id = expected_id = msg_id++;
-
-		buffer_clear(&msg);
-		buffer_put_char(&msg, SSH2_FXP_READ);
-		buffer_put_int(&msg, id);
-		buffer_put_string(&msg, handle, handle_len);
-		buffer_put_int64(&msg, offset);
-		buffer_put_int(&msg, COPY_SIZE);
-		send_msg(fd_out, &msg);
-		debug3("Sent message SSH2_FXP_READ I:%d O:%llu S:%u",
-		    id, (u_int64_t)offset, COPY_SIZE);
+		/* Send some more requests */
+		while (num_req < max_req) {
+			req[num_req].id = msg_id++;
+			req[num_req].len = COPY_SIZE;
+			req[num_req].offset = offset;
+			offset += req[num_req].len;
+			num_req++;
+			send_request(fd_out, handle, handle_len, SSH2_FXP_READ,
+				     &req[num_req - 1], &msg);
+		}
 
 		buffer_clear(&msg);
-
 		get_msg(fd_in, &msg);
 		type = buffer_get_char(&msg);
 		id = buffer_get_int(&msg);
 		debug3("Received reply T:%d I:%d", type, id);
-		if (id != expected_id)
-			fatal("ID mismatch (%d != %d)", id, expected_id);
-		if (type == SSH2_FXP_STATUS) {
+		reply = find_request(req, num_req, id);
+
+		switch (type) {
+		case SSH2_FXP_STATUS:
 			status = buffer_get_int(&msg);
+			if (status != SSH2_FX_EOF)
+				read_error = 1;
+			max_req = 0;
+			remove_request(req, &num_req, reply);
+			break;
+		case SSH2_FXP_DATA:
+			data = buffer_get_string(&msg, &len);
+			if (len > req[reply].len)
+				fatal("Received more data than asked for "
+				      "%d > %d", len, req[reply].len);
+
+			debug3("In read loop, got %d offset %llu",
+			       len, req[reply].offset);
+			if ((lseek(local_fd, req[reply].offset, SEEK_SET) == -1 ||
+			     atomicio(write, local_fd, data, len) != len) &&
+			    !write_error) {
+				write_errno = errno;
+				write_error = 1;
+				max_req = 0;
+			}
+			xfree(data);
 
-			if (status == SSH2_FX_EOF)
-				break;
+			if (len == req[reply].len)
+				remove_request(req, &num_req, reply);
 			else {
-				error("Couldn't read from remote "
-				    "file \"%s\" : %s", remote_path,
-				     fx2txt(status));
-				do_close(fd_in, fd_out, handle, handle_len);
-				goto done;
+				/* Resend the request for the missing data */
+				req[reply].id = msg_id++;
+				req[reply].len -= len;
+				req[reply].offset += len;
+				send_request(fd_out, handle, handle_len,
+					     SSH2_FXP_READ, &req[reply], &msg);
 			}
-		} else if (type != SSH2_FXP_DATA) {
+			if (max_req > 0 && max_req < OVERLAPPING_REQUESTS + 1)
+				++max_req;
+			break;
+		default:
 			fatal("Expected SSH2_FXP_DATA(%d) packet, got %d",
-			    SSH2_FXP_DATA, type);
-		}
-
-		data = buffer_get_string(&msg, &len);
-		if (len > COPY_SIZE)
-			fatal("Received more data than asked for %d > %d",
-			    len, COPY_SIZE);
-
-		debug3("In read loop, got %d offset %llu", len,
-		    (u_int64_t)offset);
-		if (atomicio(write, local_fd, data, len) != len) {
-			error("Couldn't write to \"%s\": %s", local_path,
-			    strerror(errno));
-			do_close(fd_in, fd_out, handle, handle_len);
-			status = -1;
-			xfree(data);
-			goto done;
+			      SSH2_FXP_DATA, type);
 		}
-
-		offset += len;
-		xfree(data);
 	}
-	status = do_close(fd_in, fd_out, handle, handle_len);
 
-	/* Override umask and utimes if asked */
+	if (read_error) {
+	    error("Couldn't read from remote "
+		  "file \"%s\" : %s", remote_path,
+		  fx2txt(status));
+	    do_close(fd_in, fd_out, handle, handle_len);
+	} else if (write_error) {
+	    error("Couldn't write to \"%s\": %s", local_path,
+		  strerror(write_errno));
+	    status = -1;
+	    do_close(fd_in, fd_out, handle, handle_len);
+	} else {
+		status = do_close(fd_in, fd_out, handle, handle_len);
+
+		/* Override umask and utimes if asked */
 #ifdef HAVE_FCHMOD
-	if (pflag && fchmod(local_fd, mode) == -1)
+		if (pflag && fchmod(local_fd, mode) == -1)
 #else 
-	if (pflag && chmod(local_path, mode) == -1)
+		if (pflag && chmod(local_path, mode) == -1)
 #endif /* HAVE_FCHMOD */
-		error("Couldn't set mode on \"%s\": %s", local_path,
-		    strerror(errno));
-	if (pflag && (a->flags & SSH2_FILEXFER_ATTR_ACMODTIME)) {
-		struct timeval tv[2];
-		tv[0].tv_sec = a->atime;
-		tv[1].tv_sec = a->mtime;
-		tv[0].tv_usec = tv[1].tv_usec = 0;
-		if (utimes(local_path, tv) == -1)
-			error("Can't set times on \"%s\": %s", local_path,
-			    strerror(errno));
+			error("Couldn't set mode on \"%s\": %s", local_path,
+			      strerror(errno));
+		if (pflag && (a->flags & SSH2_FILEXFER_ATTR_ACMODTIME)) {
+			struct timeval tv[2];
+			tv[0].tv_sec = a->atime;
+			tv[1].tv_sec = a->mtime;
+			tv[0].tv_usec = tv[1].tv_usec = 0;
+			if (utimes(local_path, tv) == -1)
+				error("Can't set times on \"%s\": %s",
+				      local_path, strerror(errno));
+		}
 	}
-
-done:
 	close(local_fd);
 	buffer_free(&msg);
 	xfree(handle);
@@ -825,6 +891,8 @@
 	struct stat sb;
 	Attrib a;
 	int status;
+	u_int32_t startid;
+	u_int32_t ackid;
 
 	if ((local_fd = open(local_path, O_RDONLY, 0)) == -1) {
 		error("Couldn't open local file \"%s\" for reading: %s",
@@ -866,6 +934,8 @@
 		return(-1);
 	}
 
+	startid = ackid = id + 1;
+
 	/* Read from local and write to remote */
 	offset = 0;
 	for(;;) {
@@ -883,29 +953,34 @@
 		if (len == -1)
 			fatal("Couldn't read from \"%s\": %s", local_path,
 			    strerror(errno));
-		if (len == 0)
-			break;
 
-		buffer_clear(&msg);
-		buffer_put_char(&msg, SSH2_FXP_WRITE);
-		buffer_put_int(&msg, ++id);
-		buffer_put_string(&msg, handle, handle_len);
-		buffer_put_int64(&msg, offset);
-		buffer_put_string(&msg, data, len);
-		send_msg(fd_out, &msg);
-		debug3("Sent message SSH2_FXP_WRITE I:%d O:%llu S:%u",
-		    id, (u_int64_t)offset, len);
+		if (len != 0) {
+			buffer_clear(&msg);
+			buffer_put_char(&msg, SSH2_FXP_WRITE);
+			buffer_put_int(&msg, ++id);
+			buffer_put_string(&msg, handle, handle_len);
+			buffer_put_int64(&msg, offset);
+			buffer_put_string(&msg, data, len);
+			send_msg(fd_out, &msg);
+			debug3("Sent message SSH2_FXP_WRITE I:%d O:%llu S:%u",
+			       id, (u_int64_t)offset, len);
+		} else if ( id < ackid )
+			break;
 
-		status = get_status(fd_in, id);
-		if (status != SSH2_FX_OK) {
-			error("Couldn't write to remote file \"%s\": %s",
-			    remote_path, fx2txt(status));
-			do_close(fd_in, fd_out, handle, handle_len);
-			close(local_fd);
-			goto done;
+		if (id == startid || len == 0 ||
+		    id - ackid >= OVERLAPPING_REQUESTS) {
+			status = get_status(fd_in, ackid);
+			if (status != SSH2_FX_OK) {
+				error("Couldn't write to remote file \"%s\": %s",
+				      remote_path, fx2txt(status));
+				do_close(fd_in, fd_out, handle, handle_len);
+				close(local_fd);
+				goto done;
+			}
+			debug3("In write loop, got %d offset %llu", len,
+			       (u_int64_t)offset);
+			++ackid;
 		}
-		debug3("In write loop, got %d offset %llu", len,
-		    (u_int64_t)offset);
 
 		offset += len;
 	}


More information about the openssh-unix-dev mailing list