From c3b77cb7b82bfe8e94b024e81e1721532da51a9c Mon Sep 17 00:00:00 2001 From: Vishesh Date: Mon, 15 Jan 2024 13:56:34 +0530 Subject: [PATCH] Fix host stuck in connecting state (#8502) Multiple PRs are seeing test failures in test_vm_life_cycle.py because no host is available for migration of VMs. #8438 (comment) #8433 (comment) #7344 (comment) While debugging I noticed that the hosts get stuck in the Connecting state because the MS is waiting for a response to the ReadyCommand from the agent. Since we take a lock on connection and disconnection, restarting the agent doesn't work. To recover, we have to restart the MS or wait for ~1 hour (default timeout). On the agent side, it gets stuck waiting for a response from the Script execution. To reproduce, run smoke/test_vm_life_cycle.py (the TestSecuredVmMigration test class, to be specific). Once the tests are complete, you will notice that some hosts are stuck in the Connecting state, and restarting the agent fails due to the named lock. Locks on the DB can be checked using the query below. SELECT * FROM performance_schema.metadata_locks INNER JOIN performance_schema.threads ON THREAD_ID = OWNER_THREAD_ID WHERE PROCESSLIST_ID <> CONNECTION_ID() \G; This PR adds a wait for the ReadyCommand and the ModifyStoragePoolCommand, and a timeout to the Script execution, to ensure that the thread doesn't get stuck and the named lock in the database is released. 
--- .../src/main/java/com/cloud/agent/manager/AgentManagerImpl.java | 1 + .../storage/datastore/provider/DefaultHostListener.java | 1 + .../kvm/resource/wrapper/LibvirtReadyCommandWrapper.java | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java index 023f9c18e0a..d8671ed29df 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java @@ -596,6 +596,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl final Long dcId = host.getDataCenterId(); final ReadyCommand ready = new ReadyCommand(dcId, host.getId(), NumbersUtil.enableHumanReadableSizes); + ready.setWait(60); final Answer answer = easySend(hostId, ready); if (answer == null || !answer.getResult()) { // this is tricky part for secondary storage diff --git a/engine/storage/volume/src/main/java/org/apache/cloudstack/storage/datastore/provider/DefaultHostListener.java b/engine/storage/volume/src/main/java/org/apache/cloudstack/storage/datastore/provider/DefaultHostListener.java index e344a87831d..90e8742c84d 100644 --- a/engine/storage/volume/src/main/java/org/apache/cloudstack/storage/datastore/provider/DefaultHostListener.java +++ b/engine/storage/volume/src/main/java/org/apache/cloudstack/storage/datastore/provider/DefaultHostListener.java @@ -121,6 +121,7 @@ public class DefaultHostListener implements HypervisorHostListener { public boolean hostConnect(long hostId, long poolId) throws StorageConflictException { StoragePool pool = (StoragePool) this.dataStoreMgr.getDataStore(poolId, DataStoreRole.Primary); ModifyStoragePoolCommand cmd = new ModifyStoragePoolCommand(true, pool); + cmd.setWait(60); final Answer answer = agentMgr.easySend(hostId, cmd); if (answer == null) { diff --git 
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtReadyCommandWrapper.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtReadyCommandWrapper.java index fc57cd412f0..4df74decdea 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtReadyCommandWrapper.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtReadyCommandWrapper.java @@ -55,7 +55,7 @@ public final class LibvirtReadyCommandWrapper extends CommandWrapper