how to troubleshoot ssh multiplexing hanging issues?
Ken Chang
z4242814375 at gmail.com
Wed Sep 13 05:06:20 AEST 2017
list moderators: sorry, I previously sent the below message while my
gmail client was still in html mode. it should be in plain text mode
now.
As follow up to the message below. I caught a ssh mux process in the
early stages of the problem and got a very different strace output. it
is possible that previous strace was after the problem compounded on
itself and reached "too many open files". Here's an strace of the ssh
mux process when the problem starts, but is still in the early stages:
select(80, [3 4 5 9], [], NULL, NULL) = 1 (in [4])
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 382562195}) = 0
accept(4, {sa_family=AF_LOCAL, NULL}, [2]) = 80
getsockopt(80, SOL_SOCKET, SO_PEERCRED, {pid=20099, uid=1000,
gid=1000}, [12]) = 0
getuid() = 1000
fcntl(80, F_SETFD, FD_CLOEXEC) = 0
fcntl(80, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(80, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl(80, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 382828691}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 382855499}) = 0
select(81, [3 4 5 9 80], [80], NULL, NULL) = 2 (in [80], out [80])
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 382924335}) = 0
read(80, "\0\0\0\10", 4) = 4
read(80, "\0\0\0\1\0\0\0\4", 8) = 8
write(80, "\0\0\0\10\0\0\0\1\0\0\0\4", 12) = 12
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383147628}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383176998}) = 0
select(81, [3 4 5 9 80], [], NULL, NULL) = 1 (in [80])
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383240708}) = 0
read(80, "\0\0\0\10", 4) = 4
read(80, "\20\0\0\4\0\0\0\0", 8) = 8
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383331936}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383356457}) = 0
select(81, [3 4 5 9 80], [80], NULL, NULL) = 1 (out [80])
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383416936}) = 0
write(80, "\0\0\0\f\200\0\0\5\0\0\0\0\0\0Gd", 16) = 16
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383472208}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383496660}) = 0
select(81, [3 4 5 9 80], [], NULL, NULL) = 1 (in [80])
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383556519}) = 0
read(80, "\0\0\0J", 4) = 4
read(80, "\20\0\0\2\0\0\0\1\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0~"...,
74) = 74
recvmsg(80, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {81}}, msg_flags=0}, 0) = 1
recvmsg(80, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {82}}, msg_flags=0}, 0) = 1
recvmsg(80, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {83}}, msg_flags=0}, 0) = 1
ioctl(81, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(81, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(82, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(83, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, {B38400 opost isig icanon echo ...}) = 0
fcntl(81, F_SETFD, FD_CLOEXEC) = 0
fcntl(82, F_SETFD, FD_CLOEXEC) = 0
fcntl(83, F_SETFD, FD_CLOEXEC) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383965138}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383994722}) = 0
select(84, [3 4 5 9], [3], NULL, NULL) = 1 (out [3])
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 384069185}) = 0
write(3, "\0\0\0
\324\340\334\t\351p\fW^}\335\31m\2074\231\241\3\362\275)\33,r\211\26\370B"...,
52) = 52
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 384139111}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504866, 384163755}) = 0
select(84, [3 4 5 9], [], NULL, NULL) = 1 (in [4])
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 618201367}) = 0
accept(4, {sa_family=AF_LOCAL, NULL}, [2]) = 84
getsockopt(84, SOL_SOCKET, SO_PEERCRED, {pid=21881, uid=1000,
gid=1000}, [12]) = 0
getuid() = 1000
fcntl(84, F_SETFD, FD_CLOEXEC) = 0
fcntl(84, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(84, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl(84, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
brk(0) = 0x7fa4ab18f000
brk(0x7fa4ab1b0000) = 0x7fa4ab1b0000
brk(0) = 0x7fa4ab1b0000
brk(0) = 0x7fa4ab1b0000
brk(0x7fa4ab1af000) = 0x7fa4ab1af000
brk(0) = 0x7fa4ab1af000
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 618828564}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 618871803}) = 0
select(85, [3 4 5 9 84], [84], NULL, NULL) = 2 (in [84], out [84])
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619030573}) = 0
read(84, "\0\0\0\10", 4) = 4
read(84, "\0\0\0\1\0\0\0\4", 8) = 8
write(84, "\0\0\0\10\0\0\0\1\0\0\0\4", 12) = 12
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619228330}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619273086}) = 0
select(85, [3 4 5 9 84], [], NULL, NULL) = 1 (in [84])
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619493236}) = 0
read(84, "\0\0\0\10", 4) = 4
read(84, "\20\0\0\4\0\0\0\0", 8) = 8
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619630299}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619671307}) = 0
select(85, [3 4 5 9 84], [84], NULL, NULL) = 1 (out [84])
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619801592}) = 0
write(84, "\0\0\0\f\200\0\0\5\0\0\0\0\0\0Gd", 16) = 16
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619900136}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619951778}) = 0
select(85, [3 4 5 9 84], [], NULL, NULL) = 1 (in [84])
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 620130640}) = 0
read(84, "\0\0\0J", 4) = 4
read(84, "\20\0\0\2\0\0\0\1\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0~"...,
74) = 74
recvmsg(84, 0x7ffe9921d480, 0) = -1 EAGAIN (Resource
temporarily unavailable)
poll([{fd=84, events=POLLIN}], 1, 4294967295) = 1 ([{fd=84, revents=POLLIN}])
recvmsg(84, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {85}}, msg_flags=0}, 0) = 1
recvmsg(84, 0x7ffe9921d480, 0) = -1 EAGAIN (Resource
temporarily unavailable)
poll([{fd=84, events=POLLIN}], 1, 4294967295) = 1 ([{fd=84, revents=POLLIN}])
recvmsg(84, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {86}}, msg_flags=0}, 0) = 1
recvmsg(84, 0x7ffe9921d480, 0) = -1 EAGAIN (Resource
temporarily unavailable)
poll([{fd=84, events=POLLIN}], 1, 4294967295) = 1 ([{fd=84, revents=POLLIN}])
recvmsg(84, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {87}}, msg_flags=0}, 0) = 1
ioctl(85, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(85, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(86, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, {B38400 opost isig icanon echo ...}) = 0
ioctl(87, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, {B38400 opost isig icanon echo ...}) = 0
fcntl(85, F_SETFD, FD_CLOEXEC) = 0
fcntl(86, F_SETFD, FD_CLOEXEC) = 0
fcntl(87, F_SETFD, FD_CLOEXEC) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 621065833}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 621128532}) = 0
select(88, [3 4 5 9], [3], NULL, NULL) = 1 (out [3])
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 621248699}) = 0
write(3, "\0\0\0 \201d\201M\343eRg\347*\0232Q\25
\265\345\f(\304\224\272\265\246U\217\247\325"..., 52) = 52
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 621355375}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504955, 621397659}) = 0
select(88, [3 4 5 9], [], NULL, NULL) = 1 (in [4])
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 574394274}) = 0
accept(4, {sa_family=AF_LOCAL, NULL}, [2]) = 88
getsockopt(88, SOL_SOCKET, SO_PEERCRED, {pid=29906, uid=1000,
gid=1000}, [12]) = 0
getuid() = 1000
fcntl(88, F_SETFD, FD_CLOEXEC) = 0
fcntl(88, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(88, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl(88, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 574787923}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 574830224}) = 0
select(89, [3 4 5 9 88], [88], NULL, NULL) = 2 (in [88], out [88])
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 574931974}) = 0
read(88, "\0\0\0\10", 4) = 4
read(88, "\0\0\0\1\0\0\0\4", 8) = 8
write(88, "\0\0\0\10\0\0\0\1\0\0\0\4", 12) = 12
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575318288}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575371615}) = 0
select(89, [3 4 5 9 88], [], NULL, NULL) = 1 (in [88])
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575450221}) = 0
read(88, "\0\0\0\10", 4) = 4
read(88, "\20\0\0\4\0\0\0\0", 8) = 8
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575575486}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575614725}) = 0
select(89, [3 4 5 9 88], [88], NULL, NULL) = 1 (out [88])
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575707881}) = 0
write(88, "\0\0\0\f\200\0\0\5\0\0\0\0\0\0Gd", 16) = 16
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575793033}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575834086}) = 0
select(89, [3 4 5 9 88], [], NULL, NULL) = 1 (in [88])
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 576980524}) = 0
read(88, "\0\0\0,", 4) = 4
read(88, "\20\0\0\2\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0~"...,
44) = 44
recvmsg(88, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {89}}, msg_flags=0}, 0) = 1
recvmsg(88, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {90}}, msg_flags=0}, 0) = 1
recvmsg(88, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {91}}, msg_flags=0}, 0) = 1
ioctl(89, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device)
fcntl(89, F_GETFL) = 0x8802 (flags
O_RDWR|O_NONBLOCK|O_LARGEFILE)
ioctl(90, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device)
fcntl(90, F_GETFL) = 0x8002 (flags O_RDWR|O_LARGEFILE)
fcntl(90, F_SETFL, O_RDWR|O_NONBLOCK|O_LARGEFILE) = 0
ioctl(91, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device)
fcntl(91, F_GETFL) = 0x8802 (flags
O_RDWR|O_NONBLOCK|O_LARGEFILE)
fcntl(89, F_SETFD, FD_CLOEXEC) = 0
fcntl(90, F_SETFD, FD_CLOEXEC) = 0
fcntl(91, F_SETFD, FD_CLOEXEC) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577707089}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577748657}) = 0
select(92, [3 4 5 9], [3], NULL, NULL) = 1 (out [3])
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577841298}) = 0
write(3, "\0\0\0
\370L#\214*\37f\343+>4\315S\352[\301\2758\302\301\344\206\235\373#q@\345"...,
52) = 52
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577942105}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577982422}) = 0
select(92, [3 4 5 9], [], NULL, NULL) = 1 (in [4])
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417300261}) = 0
accept(4, {sa_family=AF_LOCAL, NULL}, [2]) = 92
getsockopt(92, SOL_SOCKET, SO_PEERCRED, {pid=33339, uid=1000,
gid=1000}, [12]) = 0
getuid() = 1000
fcntl(92, F_SETFD, FD_CLOEXEC) = 0
fcntl(92, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(92, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl(92, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417563912}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417591948}) = 0
select(93, [3 4 5 9 92], [92], NULL, NULL) = 2 (in [92], out [92])
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417663899}) = 0
read(92, "\0\0\0\10", 4) = 4
read(92, "\0\0\0\1\0\0\0\4", 8) = 8
write(92, "\0\0\0\10\0\0\0\1\0\0\0\4", 12) = 12
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417802399}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417829593}) = 0
select(93, [3 4 5 9 92], [], NULL, NULL) = 1 (in [92])
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417918397}) = 0
read(92, "\0\0\0\10", 4) = 4
read(92, "\20\0\0\4\0\0\0\0", 8) = 8
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418018539}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418046283}) = 0
select(93, [3 4 5 9 92], [92], NULL, NULL) = 1 (out [92])
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418128912}) = 0
write(92, "\0\0\0\f\200\0\0\5\0\0\0\0\0\0Gd", 16) = 16
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418235759}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418287367}) = 0
select(93, [3 4 5 9 92], [], NULL, NULL) = 1 (in [92])
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418444113}) = 0
read(92, "\0\0\0,", 4) = 4
read(92, "\20\0\0\2\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0~"...,
44) = 44
recvmsg(92, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {93}}, msg_flags=0}, 0) = 1
recvmsg(92, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {94}}, msg_flags=0}, 0) = 1
recvmsg(92, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}],
msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET,
cmsg_type=SCM_RIGHTS, {95}}, msg_flags=0}, 0) = 1
ioctl(93, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device)
fcntl(93, F_GETFL) = 0x8802 (flags
O_RDWR|O_NONBLOCK|O_LARGEFILE)
ioctl(94, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device)
fcntl(94, F_GETFL) = 0x8002 (flags O_RDWR|O_LARGEFILE)
fcntl(94, F_SETFL, O_RDWR|O_NONBLOCK|O_LARGEFILE) = 0
ioctl(95, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or
TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device)
fcntl(95, F_GETFL) = 0x8802 (flags
O_RDWR|O_NONBLOCK|O_LARGEFILE)
fcntl(93, F_SETFD, FD_CLOEXEC) = 0
fcntl(94, F_SETFD, FD_CLOEXEC) = 0
fcntl(95, F_SETFD, FD_CLOEXEC) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 420023362}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 420067236}) = 0
select(96, [3 4 5 9], [3], NULL, NULL) = 1 (out [3])
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 420167386}) = 0
write(3, "\0\0\0
\221rL\31Z\230\302\362'}A.\256 at P\373\222\336\2321\344I\310D\37\21\301\177"...,
52) = 52
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 420282075}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {15504958, 420321699}) = 0
select(96, [3 4 5 9], [], NULL, NULL <detached ...>
--- below was original message previously sent ---
hello ssh list,
long time user of openssh, but relatively new to the concept of ssh
multiplexing. i'm experiencing some issues and i haven't figured out
how to troubleshoot it just yet. would appreciate some help if
possible.
i'm using ssh as a communications mechanism to pass text file based
messages between 2 hosts. There are programs on each side that send
and receive these messages. When I found out about ssh multiplexing, i
was excited to use it because we were seeing several hundred ssh
connections going back and forth between the 2 hosts. when i tried ssh
multiplexing, the message latency dropped dramatically by 10 fold!
however, now that this mechanism has been in use for a week, I'm
starting to see some problems.
First, this is the .ssh/config contents:
Host *
ControlPath ~/.ssh/cm-%r@%h:%p
ControlMaster auto
ControlPersist 10m
Everything seems to work for a few days, but then ssh starts to hang,
and we start seeing several hundred ssh processes all trying to send
their message but cannot. When i try to run ssh by hand, this is what
i get:
$ ssh -vvv boss at ui1
OpenSSH_6.6.1, OpenSSL 1.0.1e-fips 11 Feb 2013
debug1: Reading configuration data /var/lib/worker/.ssh/config
debug1: /var/lib/worker/.ssh/config line 1: Applying options for *
debug1: Reading configuration data /etc/ssh/ssh_config
debug1: /etc/ssh/ssh_config line 56: Applying options for *
debug1: auto-mux: Trying existing master
And it hangs at that point indefinitely until Ctrl-C.
At this point in time, we do see the ssh mux process still running:
$ ps -eo pid,user,args | awk '$2=="worker" && $3=="ssh:" && $5=="[mux]" {print}'
29305 worker ssh: /var/lib/worker/.ssh/cm-boss at ui1:22 [mux]
I tried to attach strace to the ssh mux process, and this is what i
see when the problem is happening:
select(1024, [3 5 9], [], NULL, {0, 11336}) = 0 (Timeout)
clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778030739}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778085461}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778109973}) = 0
select(1024, [3 4 5 9], [], NULL, NULL) = 1 (in [4])
clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778186890}) = 0
accept(4, 0x7ffe26b34360, [128]) = -1 EMFILE (Too many open files)
clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778263743}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778298340}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778343707}) = 0
select(1024, [3 5 9], [], NULL, {1, 0}) = 0 (Timeout)
clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778457543}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778518096}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778546349}) = 0
select(1024, [3 4 5 9], [], NULL, NULL) = 1 (in [4])
clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778627517}) = 0
accept(4, 0x7ffe26b34360, [128]) = -1 EMFILE (Too many open files)
clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778693493}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778725395}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778749417}) = 0
select(1024, [3 5 9], [], NULL, {1, 0}) = 0 (Timeout)
clock_gettime(0x7 /* CLOCK_??? */, {17873815, 778904087}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873815, 778963540}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873815, 778988943}) = 0
select(1024, [3 4 5 9], [], NULL, NULL) = 1 (in [4])
clock_gettime(0x7 /* CLOCK_??? */, {17873815, 779072887}) = 0
accept(4, 0x7ffe26b34360, [128]) = -1 EMFILE (Too many open files)
clock_gettime(0x7 /* CLOCK_??? */, {17873815, 779158255}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873815, 779191597}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873815, 779216201}) = 0
select(1024, [3 5 9], [], NULL, {1, 0}) = 0 (Timeout)
clock_gettime(0x7 /* CLOCK_??? */, {17873816, 779334945}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873816, 779393178}) = 0
clock_gettime(0x7 /* CLOCK_??? */, {17873816, 779418473}) = 0
Does this indicate a open file limit for this user? Or is this
something else? This is ulimit -a for that user:
-bash-4.2$ ulimit -a
core file size (blocks, -c) 0
data seg size (kbytes, -d) unlimited
scheduling priority (-e) 0
file size (blocks, -f) unlimited
pending signals (-i) 2062375
max locked memory (kbytes, -l) 64
max memory size (kbytes, -m) unlimited
open files (-n) 1024
pipe size (512 bytes, -p) 8
POSIX message queues (bytes, -q) 819200
real-time priority (-r) 0
stack size (kbytes, -s) 8192
cpu time (seconds, -t) unlimited
max user processes (-u) 4096
virtual memory (kbytes, -v) unlimited
file locks (-x) unlimited
Any advice on how to troubleshoot this further? Thanks in advance...
More information about the openssh-unix-dev
mailing list