mirror of
https://github.com/apache/cloudstack.git
synced 2025-10-26 08:42:29 +01:00
Direct agents rebalance improvements with multiple management server nodes (#10674)
Sometimes hypervisor hosts (direct agents) get stuck in the Disconnected state during agent rebalancing across multiple management server nodes. This issue was noticed when the management server nodes in the cluster were restarted frequently. When there are multiple management server nodes in a cluster and one or more nodes are shut down, started, or restarted, CloudStack rebalances the hosts among the remaining nodes or moves hosts to the newly joined management server nodes. During the rebalancing period multiple operations can happen, including: - DirectAgentScan, at the interval configured by direct.agent.scan.interval - AgentRebalanceScan, to identify and schedule agents for rebalancing - TransferAgentScan, to transfer a host from its original owner to its future owner **Current Rebalance behavior** 1. For hosts that have an AgentAttache that is not forForward but are in the Disconnected state, CloudStack simply ignores them without trying to ping again or update the host status. 2. For hosts that have an AgentAttache that is forForward, CloudStack removes the agent but still tries to loadDirectlyConnectedHost. **Improved Rebalance behavior** During DirectAgentScan, scanDirectAgentToLoad() identifies self-managed hosts that are in the Disconnected state (disconnected after the ping timeout). 1. For hosts that have an AgentAttache that is forForward, CloudStack should remove the agent. 2. For hosts that have an AgentAttache that is not forForward but are in the Disconnected state, CloudStack should investigate and update the status to Up if the host is pingable. 3. For hosts that don't have an AgentAttache, CloudStack should try to loadDirectlyConnectedHost.
This commit is contained in:
parent
0648d000b2
commit
95489b8bdd
@ -191,7 +191,7 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust
|
||||
scanDirectAgentToLoad();
|
||||
}
|
||||
|
||||
private void scanDirectAgentToLoad() {
|
||||
protected void scanDirectAgentToLoad() {
|
||||
logger.trace("Begin scanning directly connected hosts");
|
||||
|
||||
// for agents that are self-managed, threshold to be considered as disconnected after pingtimeout
|
||||
@ -212,11 +212,21 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust
|
||||
logger.info("{} is detected down, but we have a forward attache running, disconnect this one before launching the host", host);
|
||||
removeAgent(agentattache, Status.Disconnected);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
logger.debug("Loading directly connected {}", host);
|
||||
logger.debug("Host {} status is {} but has an AgentAttache which is not forForward, try to load directly", host, host.getStatus());
|
||||
Status hostStatus = investigate(agentattache);
|
||||
if (Status.Up == hostStatus) {
|
||||
/* Got ping response from host, bring it back */
|
||||
logger.info("After investigation, Agent for host {} is determined to be up and running", host);
|
||||
agentStatusTransitTo(host, Event.Ping, _nodeId);
|
||||
} else {
|
||||
logger.debug("After investigation, AgentAttache is not null but host status is {}, try to load directly {}", hostStatus, host);
|
||||
loadDirectlyConnectedHost(host, false);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
logger.debug("AgentAttache is null, loading directly connected {}", host);
|
||||
loadDirectlyConnectedHost(host, false);
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
logger.warn(" can not load directly connected {} due to ", host, e);
|
||||
}
|
||||
@ -362,7 +372,7 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust
|
||||
return;
|
||||
}
|
||||
if (!result) {
|
||||
throw new CloudRuntimeException("Failed to propagate agent change request event:" + Event.ShutdownRequested + " to host:" + hostId);
|
||||
throw new CloudRuntimeException(String.format("Failed to propagate agent change request event: %s to host: %s", Event.ShutdownRequested, hostId));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,150 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package com.cloud.agent.manager;
|
||||
|
||||
import com.cloud.configuration.ManagementServiceConfiguration;
|
||||
import com.cloud.ha.HighAvailabilityManagerImpl;
|
||||
import com.cloud.host.HostVO;
|
||||
import com.cloud.host.Status;
|
||||
import com.cloud.host.dao.HostDao;
|
||||
import com.cloud.resource.ResourceManagerImpl;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.junit.MockitoJUnitRunner;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.anyBoolean;
|
||||
import static org.mockito.ArgumentMatchers.anyLong;
|
||||
import static org.mockito.Mockito.doReturn;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.never;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
@RunWith(MockitoJUnitRunner.class)
|
||||
public class ClusteredAgentManagerImplTest {
|
||||
|
||||
private HostDao _hostDao;
|
||||
@Mock
|
||||
ManagementServiceConfiguration _mgmtServiceConf;
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
_hostDao = mock(HostDao.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void scanDirectAgentToLoadNoHostsTest() {
|
||||
ClusteredAgentManagerImpl clusteredAgentManagerImpl = mock(ClusteredAgentManagerImpl.class);
|
||||
clusteredAgentManagerImpl._hostDao = _hostDao;
|
||||
clusteredAgentManagerImpl.scanDirectAgentToLoad();
|
||||
verify(clusteredAgentManagerImpl, never()).findAttache(anyLong());
|
||||
verify(clusteredAgentManagerImpl, never()).loadDirectlyConnectedHost(any(), anyBoolean());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void scanDirectAgentToLoadHostWithoutAttacheTest() {
|
||||
// Arrange
|
||||
ClusteredAgentManagerImpl clusteredAgentManagerImpl = Mockito.spy(ClusteredAgentManagerImpl.class);
|
||||
HostVO hostVO = mock(HostVO.class);
|
||||
clusteredAgentManagerImpl._hostDao = _hostDao;
|
||||
clusteredAgentManagerImpl.mgmtServiceConf = _mgmtServiceConf;
|
||||
clusteredAgentManagerImpl._resourceMgr = mock(ResourceManagerImpl.class);
|
||||
when(_mgmtServiceConf.getTimeout()).thenReturn(16000L);
|
||||
when(hostVO.getId()).thenReturn(1L);
|
||||
List hosts = new ArrayList<>();
|
||||
hosts.add(hostVO);
|
||||
when(_hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts);
|
||||
AgentAttache agentAttache = mock(AgentAttache.class);
|
||||
doReturn(Boolean.TRUE).when(clusteredAgentManagerImpl).loadDirectlyConnectedHost(hostVO, false);
|
||||
clusteredAgentManagerImpl.scanDirectAgentToLoad();
|
||||
verify(clusteredAgentManagerImpl).loadDirectlyConnectedHost(hostVO, false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void scanDirectAgentToLoadHostWithForwardAttacheTest() {
|
||||
ClusteredAgentManagerImpl clusteredAgentManagerImpl = Mockito.spy(ClusteredAgentManagerImpl.class);
|
||||
HostVO hostVO = mock(HostVO.class);
|
||||
clusteredAgentManagerImpl._hostDao = _hostDao;
|
||||
clusteredAgentManagerImpl.mgmtServiceConf = _mgmtServiceConf;
|
||||
when(_mgmtServiceConf.getTimeout()).thenReturn(16000L);
|
||||
when(hostVO.getId()).thenReturn(1L);
|
||||
List hosts = new ArrayList<>();
|
||||
hosts.add(hostVO);
|
||||
when(_hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts);
|
||||
AgentAttache agentAttache = mock(AgentAttache.class);
|
||||
when(agentAttache.forForward()).thenReturn(Boolean.TRUE);
|
||||
when(clusteredAgentManagerImpl.findAttache(1L)).thenReturn(agentAttache);
|
||||
|
||||
clusteredAgentManagerImpl.scanDirectAgentToLoad();
|
||||
verify(clusteredAgentManagerImpl).removeAgent(agentAttache, Status.Disconnected);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void scanDirectAgentToLoadHostWithNonForwardAttacheTest() {
|
||||
// Arrange
|
||||
ClusteredAgentManagerImpl clusteredAgentManagerImpl = Mockito.spy(new ClusteredAgentManagerImpl());
|
||||
HostVO hostVO = mock(HostVO.class);
|
||||
clusteredAgentManagerImpl._hostDao = _hostDao;
|
||||
clusteredAgentManagerImpl.mgmtServiceConf = _mgmtServiceConf;
|
||||
clusteredAgentManagerImpl._haMgr = mock(HighAvailabilityManagerImpl.class);
|
||||
when(_mgmtServiceConf.getTimeout()).thenReturn(16000L);
|
||||
when(hostVO.getId()).thenReturn(0L);
|
||||
List hosts = new ArrayList<>();
|
||||
hosts.add(hostVO);
|
||||
when(_hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts);
|
||||
|
||||
AgentAttache agentAttache = mock(AgentAttache.class);
|
||||
when(agentAttache.forForward()).thenReturn(Boolean.FALSE);
|
||||
when(clusteredAgentManagerImpl.findAttache(0L)).thenReturn(agentAttache);
|
||||
doReturn(Boolean.TRUE).when(clusteredAgentManagerImpl).agentStatusTransitTo(hostVO, Status.Event.Ping, clusteredAgentManagerImpl._nodeId);
|
||||
doReturn(Status.Up).when(clusteredAgentManagerImpl).investigate(agentAttache);
|
||||
|
||||
clusteredAgentManagerImpl.scanDirectAgentToLoad();
|
||||
verify(clusteredAgentManagerImpl).investigate(agentAttache);
|
||||
verify(clusteredAgentManagerImpl).agentStatusTransitTo(hostVO, Status.Event.Ping, clusteredAgentManagerImpl._nodeId);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void scanDirectAgentToLoadHostWithNonForwardAttacheAndDisconnectedTest() {
|
||||
ClusteredAgentManagerImpl clusteredAgentManagerImpl = Mockito.spy(ClusteredAgentManagerImpl.class);
|
||||
HostVO hostVO = mock(HostVO.class);
|
||||
clusteredAgentManagerImpl._hostDao = _hostDao;
|
||||
clusteredAgentManagerImpl.mgmtServiceConf = _mgmtServiceConf;
|
||||
clusteredAgentManagerImpl._haMgr = mock(HighAvailabilityManagerImpl.class);
|
||||
clusteredAgentManagerImpl._resourceMgr = mock(ResourceManagerImpl.class);
|
||||
when(_mgmtServiceConf.getTimeout()).thenReturn(16000L);
|
||||
when(hostVO.getId()).thenReturn(0L);
|
||||
List hosts = new ArrayList<>();
|
||||
hosts.add(hostVO);
|
||||
when(_hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts);
|
||||
AgentAttache agentAttache = mock(AgentAttache.class);
|
||||
when(agentAttache.forForward()).thenReturn(Boolean.FALSE);
|
||||
when(clusteredAgentManagerImpl.findAttache(0L)).thenReturn(agentAttache);
|
||||
doReturn(Boolean.TRUE).when(clusteredAgentManagerImpl).loadDirectlyConnectedHost(hostVO, false);
|
||||
clusteredAgentManagerImpl.scanDirectAgentToLoad();
|
||||
verify(clusteredAgentManagerImpl).investigate(agentAttache);
|
||||
verify(clusteredAgentManagerImpl).loadDirectlyConnectedHost(hostVO, false);
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user