@@ -82,6 +82,7 @@ vlog = ovs.vlog.Vlog("ovs-monitor-ipsec")
exiting = False
monitor = None
xfrm = None
+TIMEOUT_EXPIRED = 37
def run_command(args, description=None):
@@ -94,7 +95,16 @@ def run_command(args, description=None):
vlog.dbg("Running %s" % args)
proc = subprocess.Popen(args, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
- pout, perr = proc.communicate()
+ try:
+ pout, perr = proc.communicate(timeout=120)
+ ret = proc.returncode
+ except subprocess.TimeoutExpired:
+ vlog.warn("Command timed out trying to %s." % description)
+ pout, perr = b'', b''
+ # Just kill the process here. We can't afford waiting for it,
+ # as it may be stuck and may not actually be terminated.
+ proc.kill()
+ ret = TIMEOUT_EXPIRED
if proc.returncode or len(perr):
vlog.warn("Failed to %s; exit code: %d"
@@ -103,7 +113,7 @@ def run_command(args, description=None):
vlog.warn("stderr: %s" % perr)
vlog.warn("stdout: %s" % pout)
- return proc.returncode, pout or b'', perr or b''
+ return ret, pout or b'', perr or b''
class XFRM(object):
Multiple versions of Libreswan have an issue where the ipsec --start command may get stuck forever. This issue affects many popular versions of Libreswan from 4.5 to 4.15, which are shipped in most modern distributions. When ipsec --start gets stuck, ovs-monitor-ipsec hangs and can't do anything else, so not only this tunnel but all other tunnels are not started either. Add a timeout to the subprocess call, so we do not wait forever. The recently introduced reconciliation process will clean things up and will try to re-add this connection later. Pluto may take a lot of time to process the --start request. Notably, the time depends on the retransmission timeout, which is 60 seconds by default. However, even at high scale, it doesn't take much more than that in tests. So, a 120-second timeout should be a reasonable default value. Note: it is observed in practice that the process doesn't actually terminate for a long time, so we can't afford waiting for it. That's the main reason why we're not using subprocess.run() with a timeout option here (it would wait). Also, we would have to catch the exception anyway. Reported-at: https://issues.redhat.com/browse/FDP-846 Signed-off-by: Ilya Maximets <i.maximets@ovn.org> --- ipsec/ovs-monitor-ipsec.in | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)