so-called-hang-on-exit

Jani Jaakkola jjaakkol at cs.helsinki.fi
Sun Aug 4 05:02:27 EST 2002


On Sat, 3 Aug 2002, Jani Jaakkola wrote:

> My suggestion is that instead of just closing the channel when the 
> session process dies, sshd should effort to empty the pty buffer before 
> closing the channel. This could be implemented so that 
> when the session process dies, sshd should read (and buffer) the pty 
> master data until it gets EAGAIN and then close the channel. All 
> pending output from dead session processes should allready be in the pty 
> buffer, so first EAGAIN means that all of it has been read processed.
> 
> I guess I will attempt to implement this, but it will take some time (I 
> will need to learn how session.c and serverloop.c work first).

OK, now I have implemented it. Currently only for protocol 2, but a 
similar thing can be implemented for protocol 1 (and the protocol 1 patch 
would be simpler).

Here is what it does (or should do; I have made mistakes before):

- Applies against openssh-3.4p1
- This patch only changes behavior of ssh protocol 2 sessions with a tty.
  Everything else should remain as it was.
- It does its job by reading all immediately available data from the
  pty when the pty session process dies. This should include all data
  from the pty session foreground processes.
- It can and will lose data from pty session background processes.
- It might still hang indefinetely, if the background processes keep
  producing more data than the sshd process can read and transfer to 
  client (so that pty buffer will never get completely empty). I consider 
  this a feature of this solution, not a bug.

Here is a way to test it. Running this nice and simple shell command:

$ ssh -2 -t localhost 'echo start; nohup sh -c "(sleep 10; echo hang-on-exit-bug) > `tty`" > /dev/null & sleep 1; echo stop'

Will take about ten seconds and give this kind of output, if you have the 
hang-on-exit bug:

<CLIP>
start
stop
hang-on-exit-bug
Connection to localhost closed.
</CLIP>

With Markus Friedls patch it will sometimes lose data and give output like 
this:

<CLIP>
start
Connection to localhost closed.
</CLIP>

Note that it might take many tries to actually lose the data and that with 
some environments it might never happen. But the race sure is there.

With my patch it should always give output like this in about 1 sec 
(and authentication delays):

<CLIP>
start
stop
Connection to localhost closed.
</CLIP>

The patch follows. It is also available from 
http://www.cs.helsinki.fi/u/jjaakkol/openssh-hang-on-exit.patch

- Jani

diff -bur openssh-3.4p1.orig/channels.c openssh-3.4p1/channels.c
--- openssh-3.4p1.orig/channels.c	Wed Jun 26 12:14:43 2002
+++ openssh-3.4p1/channels.c	Sat Aug  3 18:39:38 2002
@@ -245,6 +245,8 @@
 	buffer_init(&c->extended);
 	c->ostate = CHAN_OUTPUT_OPEN;
 	c->istate = CHAN_INPUT_OPEN;
+	c->read_last_input=0;
+	c->timeout_ms=UINT_MAX;
 	c->flags = 0;
 	channel_register_fds(c, rfd, wfd, efd, extusage, nonblock);
 	c->self = found;
@@ -1232,14 +1234,32 @@
 	char buf[16*1024];
 	int len;
 
+	/* If read_last_input is set, we ignore the reaset bit since
+	 * we want to catch EAGAIN. */
 	if (c->rfd != -1 &&
-	    FD_ISSET(c->rfd, readset)) {
+	    (c->read_last_input || FD_ISSET(c->rfd, readset)) ) {
 		len = read(c->rfd, buf, sizeof(buf));
-		if (len < 0 && (errno == EINTR || errno == EAGAIN))
+		if (len < 0) {
+			if ( errno == EINTR ) return 1;
+			/*
+			 * if channel->read_last_input is not set, 
+			 * EAGAIN means (as usual) that we try later again.
+			 * 
+			 * If channel->read_last_input is set, we
+			 * close the read channel on first EAGAIN.
+			 */
+			if ( errno == EAGAIN && !c->read_last_input) {
 			return 1;
+			}
+		}
 		if (len <= 0) {
 			debug("channel %d: read<=0 rfd %d len %d",
 			    c->self, c->rfd, len);
+			if (c->read_last_input) {
+				verbose("hang-on-exit protocol 2 workaround finished.");
+				c->read_last_input=0;
+				c->timeout_ms=0;
+			}
 			if (c->type != SSH_CHANNEL_OPEN) {
 				debug("channel %d: not open", c->self);
 				chan_mark_dead(c);
@@ -1507,7 +1527,8 @@
 }
 
 static void
-channel_handler(chan_fn *ftab[], fd_set * readset, fd_set * writeset)
+channel_handler(chan_fn *ftab[], fd_set * readset, fd_set * writeset,
+		u_int *timeout_milliseconds)
 {
 	static int did_init = 0;
 	int i;
@@ -1523,6 +1544,12 @@
 			continue;
 		if (ftab[c->type] != NULL)
 			(*ftab[c->type])(c, readset, writeset);
+		/* Check if this channel wants a timeout for some reason */
+		/* NOTE: for serverloop timeout_milliseconds==0 means 
+		 * no timeout set instead of zero millisecond timeout */
+		if (timeout_milliseconds && c->timeout_ms!=UINT_MAX &&
+		    (*timeout_milliseconds == 0 || c->timeout_ms<*timeout_milliseconds) )
+			*timeout_milliseconds=c->timeout_ms;
 		channel_garbage_collect(c);
 	}
 }
@@ -1530,9 +1557,11 @@
 /*
  * Allocate/update select bitmasks and add any bits relevant to channels in
  * select bitmasks.
+ * Also let channels adjust the max_time_milliseconds select() timeout
  */
 void
-channel_prepare_select(fd_set **readsetp, fd_set **writesetp, int *maxfdp,
+channel_prepare_select(fd_set **readsetp, fd_set **writesetp, 
+    u_int *max_time_milliseconds, int *maxfdp,
     int *nallocp, int rekeying)
 {
 	int n;
@@ -1552,7 +1581,8 @@
 	memset(*writesetp, 0, sz);
 
 	if (!rekeying)
-		channel_handler(channel_pre, *readsetp, *writesetp);
+		channel_handler(channel_pre, *readsetp, *writesetp, 
+				max_time_milliseconds);
 }
 
 /*
@@ -1562,7 +1592,7 @@
 void
 channel_after_select(fd_set * readset, fd_set * writeset)
 {
-	channel_handler(channel_post, readset, writeset);
+	channel_handler(channel_post, readset, writeset, NULL);
 }
 
 
diff -bur openssh-3.4p1.orig/channels.h openssh-3.4p1/channels.h
--- openssh-3.4p1.orig/channels.h	Wed Jun 26 02:17:37 2002
+++ openssh-3.4p1/channels.h	Sat Aug  3 17:20:19 2002
@@ -70,6 +70,16 @@
 	int     self;		/* my own channel identifier */
 	int     remote_id;	/* channel identifier for remote peer */
 	u_int   istate;		/* input from channel (state of receive half) */
+	/* read_last_input state is a hack to force the channel to read
+	 * all immediately available data from the input fd and close it
+	 * on first EAGAIN. This is needed to properly fix the 
+	 * "hang-on-exit bug".
+	 * XXX : maybe read_last_input should be merged to istate, but that
+	 *       is a larger hack. */
+	int     read_last_input;
+	/* Should this channel add a timeout to
+	 * select() loop. UINT_MAX if no timeout. */
+	u_int   timeout_ms;
 	u_int   ostate;		/* output to channel  (state of transmit half) */
 	int     flags;		/* close sent/rcvd */
 	int     rfd;		/* read fd */
@@ -180,7 +190,7 @@
 
 /* file descriptor handling (read/write) */
 
-void	 channel_prepare_select(fd_set **, fd_set **, int *, int*, int);
+void	 channel_prepare_select(fd_set **, fd_set **, u_int *, int *, int*, int);
 void     channel_after_select(fd_set *, fd_set *);
 void     channel_output_poll(void);
 
diff -bur openssh-3.4p1.orig/clientloop.c openssh-3.4p1/clientloop.c
--- openssh-3.4p1.orig/clientloop.c	Wed Jun 26 02:17:37 2002
+++ openssh-3.4p1/clientloop.c	Sat Aug  3 17:23:01 2002
@@ -322,7 +322,7 @@
     int *maxfdp, int *nallocp, int rekeying)
 {
 	/* Add any selections by the channel mechanism. */
-	channel_prepare_select(readsetp, writesetp, maxfdp, nallocp, rekeying);
+	channel_prepare_select(readsetp, writesetp, NULL, maxfdp, nallocp, rekeying);
 
 	if (!compat20) {
 		/* Read from the connection, unless our buffers are full. */
diff -bur openssh-3.4p1.orig/serverloop.c openssh-3.4p1/serverloop.c
--- openssh-3.4p1.orig/serverloop.c	Wed Jun 26 02:17:37 2002
+++ openssh-3.4p1/serverloop.c	Sat Aug  3 21:37:26 2002
@@ -261,7 +261,8 @@
 	}
 
 	/* Allocate and update select() masks for channel descriptors. */
-	channel_prepare_select(readsetp, writesetp, maxfdp, nallocp, 0);
+	channel_prepare_select(readsetp, writesetp, &max_time_milliseconds,
+			       maxfdp, nallocp, 0);
 
 	if (compat20) {
 #if 0
@@ -308,7 +309,8 @@
 	 * from it, then read as much as is available and exit.
 	 */
 	if (child_terminated && packet_not_very_much_data_to_write())
-		if (max_time_milliseconds == 0 || client_alive_scheduled)
+		if (max_time_milliseconds == 0 || 
+		    (client_alive_scheduled && max_time_milliseconds>100) )
 			max_time_milliseconds = 100;
 
 	if (max_time_milliseconds == 0)
diff -bur openssh-3.4p1.orig/session.c openssh-3.4p1/session.c
--- openssh-3.4p1.orig/session.c	Wed Jun 26 16:51:06 2002
+++ openssh-3.4p1/session.c	Sat Aug  3 21:41:03 2002
@@ -1835,11 +1835,27 @@
 	/*
 	 * emulate a write failure with 'chan_write_failed', nobody will be
 	 * interested in data we write.
-	 * Note that we must not call 'chan_read_failed', since there could
+	 * Note that for the non-pty case we must not call 'chan_read_failed',
+	 * since there could
 	 * be some more data waiting in the pipe.
 	 */
 	if (c->ostate != CHAN_OUTPUT_CLOSED)
 		chan_write_failed(c);
+	/* Check if we need the hang-on-exit bug workaround */
+	if (s->ttyfd != -1 && c->istate == CHAN_INPUT_OPEN) {
+		/* Set the channel read_last_input state so that we 
+		 * can empty the pty buffer, before forcing the read side 
+		 * closed.
+		 * XXX: this thing should probably be moved to channels.c,
+	         * to its own function.
+		 */
+		c->read_last_input=1;
+		/* XXX: Note that we cannot set the timeout to 0ms, since
+		 * serverloop interprets 0 as no timeout set instead of 
+		 * 0ms timeout */
+		c->timeout_ms=10;
+		verbose("hang-on-exit protocol 2 workaround applied");
+	}
 	s->chanid = -1;
 }
 




More information about the openssh-unix-dev mailing list