Merge pull request #1293 from miguelaferreira/nsx-heath-check

Add Health Check Command to NSX plugin. The NSX plugin does not support the CheckHealthCommand. Instead, it fakes a PingCommand as a call to the control cluster status API.
However, we have seen in production that the management server will sometimes find the NSX controller to be behind on ping, and that will trigger a CheckHealthCommand, which will return with an unsupported command answer.
Once this happens, the controller is put into Alert state and will not recover until the management server is restarted.

In addition, during the investigation, a null pointer exception is thrown due to the fact that the NSX controllers do not live in a pod.

This PR tries to address those two issues.

* pr/1293:
  Implement CheckHealthCommand for NSX controllers
  Fix log message that refers to agent, not host
  Prevent NullPointerException when host does not belong to a pod

Signed-off-by: Remi Bergsma <github@remi.nl>
This commit is contained in:
Remi Bergsma 2016-01-16 20:39:43 +01:00
commit a767407fd2
3 changed files with 214 additions and 71 deletions

View File

@ -121,10 +121,8 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
protected static final Logger status_logger = Logger.getLogger(Status.class);
/**
* _agents is a ConcurrentHashMap, but it is used from within a synchronized block.
* This will be reported by findbugs as JLM_JSR166_UTILCONCURRENT_MONITORENTER.
* Maybe a ConcurrentHashMap is not the right thing to use here, but i'm not sure
* so i leave it alone.
* _agents is a ConcurrentHashMap, but it is used from within a synchronized block. This will be reported by findbugs as JLM_JSR166_UTILCONCURRENT_MONITORENTER. Maybe a
* ConcurrentHashMap is not the right thing to use here, but i'm not sure so i leave it alone.
*/
protected ConcurrentHashMap<Long, AgentAttache> _agents = new ConcurrentHashMap<Long, AgentAttache>(10007);
protected List<Pair<Integer, Listener>> _hostMonitors = new ArrayList<Pair<Integer, Listener>>(17);
@ -821,9 +819,9 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
try {
nextStatus = host.getStatus().getNextStatus(event);
} catch (final NoTransitionException ne) {
/* Agent may be currently in status of Down, Alert, Removed, namely there is no next status for some events.
* Why this can happen? Ask God not me. I hate there was no piece of comment for code handling race condition.
* God knew what race condition the code dealt with!
/*
* Agent may be currently in status of Down, Alert, Removed, namely there is no next status for some events. Why this can happen? Ask God not me. I hate there was
* no piece of comment for code handling race condition. God knew what race condition the code dealt with!
*/
s_logger.debug("Caught exception while getting agent's next status", ne);
}
@ -845,7 +843,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
}
final Status currentStatus = host.getStatus();
s_logger.info("The agent " + hostId + " state determined is " + determinedState);
s_logger.info("The agent from host " + hostId + " state determined is " + determinedState);
if (determinedState == Status.Down) {
final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
@ -883,7 +881,8 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
// if we end up here we are in alert state, send an alert
final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
final HostPodVO podVO = _podDao.findById(host.getPodId());
final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
final String podName = podVO != null ? podVO.getName() : "NO POD";
final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podName;
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host in ALERT state, " + hostDesc,
"In availability zone " + host.getDataCenterId() + ", host is in alert state: " + host.getId() + "-" + host.getName());
}
@ -1141,7 +1140,8 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
}
protected void connectAgent(final Link link, final Command[] cmds, final Request request) {
//send startupanswer to agent in the very beginning, so agent can move on without waiting for the answer for an undetermined time, if we put this logic into another thread pool.
// send startupanswer to agent in the very beginning, so agent can move on without waiting for the answer for an undetermined time, if we put this logic into another
// thread pool.
final StartupAnswer[] answers = new StartupAnswer[cmds.length];
Command cmd;
for (int i = 0; i < cmds.length; i++) {
@ -1250,7 +1250,8 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
final HostPodVO podVO = _podDao.findById(host.getPodId());
final String hostDesc =
"name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
"name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: "
+ podVO.getName();
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_ROUTING, host.getDataCenterId(), host.getPodId(),
"Host lost connection to gateway, " + hostDesc, "Host [" + hostDesc +
@ -1533,11 +1534,9 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
final ResourceState resourceState = h.getResourceState();
if (resourceState == ResourceState.Disabled || resourceState == ResourceState.Maintenance || resourceState == ResourceState.ErrorInMaintenance) {
/*
* Host is in non-operation state, so no
* investigation and direct put agent to
* Disconnected
* Host is in non-operation state, so no investigation and direct put agent to Disconnected
*/
status_logger.debug("Ping timeout but host " + agentId + " is in resource state of " + resourceState + ", so no investigation");
status_logger.debug("Ping timeout but agent " + agentId + " is in resource state of " + resourceState + ", so no investigation");
disconnectWithoutInvestigation(agentId, Event.ShutdownRequested);
} else {
final HostVO host = _hostDao.findById(agentId);
@ -1547,7 +1546,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
s_logger.warn("Disconnect agent for CPVM/SSVM due to physical connection close. host: " + host.getId());
disconnectWithoutInvestigation(agentId, Event.ShutdownRequested);
} else {
status_logger.debug("Ping timeout for host " + agentId + ", do invstigation");
status_logger.debug("Ping timeout for agent " + agentId + ", do invstigation");
disconnectWithInvestigation(agentId, Event.PingTimeout);
}
}
@ -1653,7 +1652,8 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
@Override
public ConfigKey<?>[] getConfigKeys() {
return new ConfigKey<?>[] {CheckTxnBeforeSending, Workers, Port, PingInterval, PingTimeout, Wait, AlertWait, DirectAgentLoadSize, DirectAgentPoolSize, DirectAgentThreadCap};
return new ConfigKey<?>[] { CheckTxnBeforeSending, Workers, Port, PingInterval, PingTimeout, Wait, AlertWait, DirectAgentLoadSize, DirectAgentPoolSize,
DirectAgentThreadCap };
}
}

View File

@ -0,0 +1,63 @@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
package com.cloud.network.resource.wrapper;
import org.apache.log4j.Logger;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.CheckHealthAnswer;
import com.cloud.agent.api.CheckHealthCommand;
import com.cloud.network.nicira.ControlClusterStatus;
import com.cloud.network.nicira.NiciraNvpApi;
import com.cloud.network.nicira.NiciraNvpApiException;
import com.cloud.network.resource.NiciraNvpResource;
import com.cloud.resource.CommandWrapper;
import com.cloud.resource.ResourceWrapper;
/**
 * Handles {@link CheckHealthCommand} for a {@link NiciraNvpResource} by querying the
 * control cluster status through the resource's {@link NiciraNvpApi}.
 *
 * The resource is reported healthy only when the control cluster status equals
 * {@code "stable"}. Any other status, a missing status, or an API exception is
 * reported as unhealthy.
 */
@ResourceWrapper(handles = CheckHealthCommand.class)
public class NiciraCheckHealthCommandWrapper extends CommandWrapper<CheckHealthCommand, Answer, NiciraNvpResource> {

    /** The only control cluster status value that is treated as healthy. */
    private static final String CONTROL_CLUSTER_STATUS_IS_STABLE = "stable";

    private static final Logger s_logger = Logger.getLogger(NiciraCheckHealthCommandWrapper.class);

    /**
     * Checks the health of the control cluster behind the given resource.
     *
     * @param command the health check command being answered
     * @param serverResource the resource whose {@link NiciraNvpApi} is queried
     * @return a {@link CheckHealthAnswer} whose result is {@code true} only when the
     *         control cluster status is {@code "stable"}
     */
    @Override
    public Answer execute(final CheckHealthCommand command, final NiciraNvpResource serverResource) {
        final NiciraNvpApi niciraNvpApi = serverResource.getNiciraNvpApi();
        boolean healthy = true;
        try {
            final ControlClusterStatus clusterStatus = niciraNvpApi.getControlClusterStatus();
            // Guard against a missing status object: treat it like a missing status string
            // (i.e. unstable) instead of letting a NullPointerException escape the health check.
            final String status = clusterStatus != null ? clusterStatus.getClusterStatus() : null;
            if (clusterIsUnstable(status)) {
                s_logger.warn("Control cluster is not stable. Current status is " + status);
                healthy = false;
            }
        } catch (final NiciraNvpApiException e) {
            s_logger.error("Exception caught while checking control cluster status during health check", e);
            healthy = false;
        }
        return new CheckHealthAnswer(command, healthy);
    }

    /**
     * Tells whether the given control cluster status denotes an unstable cluster.
     *
     * @param clusterStatus the status string reported by the control cluster, may be {@code null}
     * @return {@code true} unless the status equals {@code "stable"}; {@code null} counts as unstable
     */
    protected boolean clusterIsUnstable(final String clusterStatus) {
        return !CONTROL_CLUSTER_STATUS_IS_STABLE.equals(clusterStatus);
    }
}

View File

@ -0,0 +1,80 @@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
package com.cloud.network.resource.wrapper;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import org.junit.Before;
import org.junit.Test;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.CheckHealthCommand;
import com.cloud.network.nicira.ControlClusterStatus;
import com.cloud.network.nicira.NiciraNvpApi;
import com.cloud.network.nicira.NiciraNvpApiException;
import com.cloud.network.resource.NiciraNvpResource;
/**
 * Unit tests for {@link NiciraCheckHealthCommandWrapper}, run against a mocked
 * {@link NiciraNvpResource} and {@link NiciraNvpApi}.
 */
public class NiciraCheckHealthCommandWrapperTest {

    private final NiciraNvpResource niciraResource = mock(NiciraNvpResource.class);
    private final NiciraNvpApi niciraApi = mock(NiciraNvpApi.class);

    /** Wires the mocked API into the mocked resource before each test. */
    @Before
    public void setup() {
        when(niciraResource.getNiciraNvpApi()).thenReturn(niciraApi);
    }

    /** A freshly constructed status object (no "stable" status set) must yield an unhealthy answer. */
    @Test
    public void testExecuteWhenClusterIsNotStable() throws Exception {
        when(niciraApi.getControlClusterStatus()).thenReturn(new ControlClusterStatus());

        final Answer answer = executeHealthCheck();

        assertThat(answer.getResult(), equalTo(false));
    }

    /** An API exception while fetching the status must yield an unhealthy answer. */
    @SuppressWarnings("unchecked")
    @Test
    public void testExecuteWhenApiThrowsException() throws Exception {
        when(niciraApi.getControlClusterStatus()).thenThrow(NiciraNvpApiException.class);

        final Answer answer = executeHealthCheck();

        assertThat(answer.getResult(), equalTo(false));
    }

    /** A status of "stable" must yield a healthy answer. */
    @Test
    public void testExecuteWhenClusterIsStable() throws Exception {
        final ControlClusterStatus statusValue = mock(ControlClusterStatus.class);
        when(statusValue.getClusterStatus()).thenReturn("stable");
        when(niciraApi.getControlClusterStatus()).thenReturn(statusValue);

        final Answer answer = executeHealthCheck();

        assertThat(answer.getResult(), equalTo(true));
    }

    /** Runs a new wrapper with a new {@link CheckHealthCommand} against the mocked resource. */
    private Answer executeHealthCheck() {
        final NiciraCheckHealthCommandWrapper commandWrapper = new NiciraCheckHealthCommandWrapper();
        return commandWrapper.execute(new CheckHealthCommand(), niciraResource);
    }
}