Host-HA offers investigation, fencing, and recovery mechanisms for hosts that are malfunctioning for any reason. It uses activity and health checks to determine the current host state, based on which it may degrade a host or try to recover it; on failing to recover it, it may try to fence the host. The core feature is implemented in a hypervisor-agnostic way, with two separate driver/provider implementations for the Simulator and KVM hypervisors, and the framework also allows other hypervisor-specific providers to be implemented in the future. The Host-HA provider implementation for the KVM hypervisor uses the out-of-band management sub-system to issue IPMI calls to reset (recover) or power off (fence) a host. The Host-HA provider implementation for the Simulator provides a means of testing and validating the core framework implementation.

Signed-off-by: Abhinandan Prateek <abhinandan.prateek@shapeblue.com>
Signed-off-by: Rohit Yadav <rohit.yadav@shapeblue.com>
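As a rough illustration of the flow that the test below automates, a host is typically prepared by configuring and enabling out-of-band management and then configuring and enabling HA with the KVM provider. The following is a minimal sketch using the same Marvin command wrappers the test uses; it assumes an existing apiClient and host object, and the IPMI address, port and credentials shown are placeholders, not values from this commit:

    # Configure and enable out-of-band management for the host (placeholder BMC details)
    cmd = configureOutOfBandManagement.configureOutOfBandManagementCmd()
    cmd.driver = 'ipmitool'        # default OOBM driver
    cmd.address = '10.0.0.5'       # placeholder BMC / ipmisim address
    cmd.port = 623                 # placeholder IPMI port
    cmd.username = 'admin'
    cmd.password = 'password'
    cmd.hostid = host.id
    apiClient.configureOutOfBandManagement(cmd)

    enable_oobm = enableOutOfBandManagementForHost.enableOutOfBandManagementForHostCmd()
    enable_oobm.hostid = host.id
    apiClient.enableOutOfBandManagementForHost(enable_oobm)

    # Configure and enable Host-HA with the KVM provider
    ha_cfg = configureHAForHost.configureHAForHostCmd()
    ha_cfg.provider = 'kvmhaprovider'
    ha_cfg.hostid = host.id
    apiClient.configureHAForHost(ha_cfg)

    ha_on = enableHAForHost.enableHAForHostCmd()
    ha_on.hostid = host.id
    apiClient.enableHAForHost(ha_on)

Once HA is enabled, the framework degrades, recovers (IPMI reset) or fences (IPMI power off) the host based on the activity and health checks described above.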
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from marvin.cloudstackTestCase import *
from marvin.lib.utils import *
from marvin.lib.base import *
from marvin.lib.common import *
from nose.plugins.attrib import attr

from ipmisim.ipmisim import IpmiServerContext, IpmiServer, ThreadedIpmiServer

import random
import socket
import thread
import time  # used by the polling helpers below; made explicit instead of relying on wildcard imports


class TestHaKVMAgent(cloudstackTestCase):
    """ Test cases for Host-HA on the KVM hypervisor, driven through
        the out-of-band management sub-system (ipmisim)
    """

    def setUp(self):
        testClient = super(TestHaKVMAgent, self).getClsTestClient()

        self.apiClient = testClient.getApiClient()
        self.dbclient = testClient.getDbConnection()
        self.services = testClient.getParsedTestDataConfig()

        self.zone = get_zone(self.apiClient, testClient.getZoneForTests())
        self.host = self.getHost()
        self.cluster_id = self.host.clusterid
        self.server = None

        self.hypervisor = self.testClient.getHypervisorInfo()
        self.mgtSvrDetails = self.config.__dict__["mgtSvr"][0].__dict__
        self.hostConfig = self.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__
        self.fakeMsId = random.randint(10000, 99999) * random.randint(10, 20)

        # Cleanup any existing configs
        self.dbclient.execute("delete from ha_config where resource_type='Host'")

        # use random port for ipmisim
        s = socket.socket()
        s.bind(('', 0))
        self.serverPort = s.getsockname()[1]
        s.close()

        # Set Cluster-level settings in order to run tests faster
        self.update_configuration("kvm.ha.activity.check.failure.ratio", "0.7")
        self.update_configuration("kvm.ha.activity.check.interval", "10")
        self.update_configuration("kvm.ha.activity.check.max.attempts", "5")
        self.update_configuration("kvm.ha.activity.check.timeout", "60")
        self.update_configuration("kvm.ha.degraded.max.period", "30")
        self.update_configuration("kvm.ha.fence.timeout", "60")
        self.update_configuration("kvm.ha.health.check.timeout", "10")
        self.update_configuration("kvm.ha.recover.failure.threshold", "1")
        self.update_configuration("kvm.ha.recover.timeout", "120")
        self.update_configuration("kvm.ha.recover.wait.period", "60")

        self.service_offering = ServiceOffering.create(
            self.apiClient,
            self.services["service_offerings"]
        )

        self.template = get_template(
            self.apiClient,
            self.zone.id,
            self.services["ostype"]
        )

        self.cleanup = [self.service_offering]

    def tearDown(self):
        try:
            self.dbclient.execute("delete from mshost_peer where peer_runid=%s" % self.getFakeMsRunId())
            self.dbclient.execute("delete from mshost where runid=%s" % self.getFakeMsRunId())
            self.dbclient.execute("delete from cluster_details where name='resourceHAEnabled'")
            self.dbclient.execute("delete from data_center_details where name='resourceHAEnabled'")
            self.dbclient.execute("delete from ha_config where resource_type='Host'")
            self.dbclient.execute("delete from oobm where port=%d" % self.getIpmiServerPort())
            self.dbclient.execute("delete from mshost_peer where peer_runid=%s" % self.getFakeMsRunId())
            self.dbclient.execute("delete from mshost where runid=%s" % self.getFakeMsRunId())
            self.dbclient.execute("delete from cluster_details where name='outOfBandManagementEnabled'")
            self.dbclient.execute("delete from data_center_details where name='outOfBandManagementEnabled'")
            cleanup_resources(self.apiClient, self.cleanup)
            if self.server:
                self.server.shutdown()
                self.server.server_close()
        except Exception as e:
            raise Exception("Warning: Exception during cleanup : %s" % e)

    def getFakeMsId(self):
        return self.fakeMsId

    def getFakeMsRunId(self):
        return self.fakeMsId * 1000

    def getHostHaConfigCmd(self, provider='kvmhaprovider'):
        cmd = configureHAForHost.configureHAForHostCmd()
        cmd.provider = provider
        cmd.hostid = self.host.id
        return cmd

    def getHostHaEnableCmd(self):
        cmd = enableHAForHost.enableHAForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def getHost(self, hostId=None):
        response = list_hosts(
            self.apiClient,
            zoneid=self.zone.id,
            type='Routing',
            id=hostId
        )
        if len(response) > 0:
            self.host = response[0]
            return self.host
        raise self.skipTest("No hosts found, skipping host-ha test")

    def getIpmiServerIp(self):
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect((self.mgtSvrDetails["mgtSvrIp"], self.mgtSvrDetails["port"]))
        return s.getsockname()[0]

    def getIpmiServerPort(self):
        return self.serverPort

    def getOobmConfigCmd(self):
        cmd = configureOutOfBandManagement.configureOutOfBandManagementCmd()
        cmd.driver = 'ipmitool'  # The default available driver
        cmd.address = self.getIpmiServerIp()
        cmd.port = self.getIpmiServerPort()
        cmd.username = 'admin'
        cmd.password = 'password'
        cmd.hostid = self.host.id
        return cmd

    def getOobmEnableCmd(self):
        cmd = enableOutOfBandManagementForHost.enableOutOfBandManagementForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def getOobmDisableCmd(self):
        cmd = disableOutOfBandManagementForHost.disableOutOfBandManagementForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def getOobmIssueActionCmd(self):
        cmd = issueOutOfBandManagementPowerAction.issueOutOfBandManagementPowerActionCmd()
        cmd.hostid = self.host.id
        cmd.action = 'STATUS'
        return cmd

    def issue_power_action_cmd(self, action, timeout=None):
        cmd = self.getOobmIssueActionCmd()
        cmd.action = action
        if timeout:
            cmd.timeout = timeout

        try:
            return self.apiClient.issueOutOfBandManagementPowerAction(cmd)
        except Exception as e:
            if "packet session id 0x0 does not match active session" in str(e):
                raise self.skipTest("Known ipmitool issue hit, skipping test")
            raise e

    def configure_and_enable_oobm(self):
        self.apiClient.configureOutOfBandManagement(self.getOobmConfigCmd())
        response = self.apiClient.enableOutOfBandManagementForHost(self.getOobmEnableCmd())
        self.assertEqual(response.enabled, True)

    def start_ipmi_server(self):
        def startIpmiServer(tname, server):
            self.debug("Starting ipmisim server")
            try:
                server.serve_forever()
            except Exception:
                pass
        IpmiServerContext('reset')
        ThreadedIpmiServer.allow_reuse_address = False
        server = ThreadedIpmiServer(('0.0.0.0', self.getIpmiServerPort()), IpmiServer)
        thread.start_new_thread(startIpmiServer, ("ipmi-server", server,))
        self.server = server

    def checkSyncToState(self, state, interval):
        def checkForStateSync(expectedState):
            response = self.getHost(hostId=self.host.id).outofbandmanagement
            return response.powerstate == expectedState, None

        sync_interval = 1 + int(interval) / 1000
        res, _ = wait_until(sync_interval, 10, checkForStateSync, state)
        if not res:
            self.fail("Failed to get host.powerstate synced to expected state:" + state)
        response = self.getHost(hostId=self.host.id).outofbandmanagement
        self.assertEqual(response.powerstate, state)

    def get_host_in_available_state(self):
        self.configure_and_start_ipmi_server()
        self.assert_issue_command_state('ON', 'On')
        self.configureAndEnableHostHa()

        self.check_host_transition_to_available()

        response = self.getHost()
        if response.hostha.hastate != "Available":
            print(response)

        self.assertEqual(response.hostha.hastate, "Available")

    def configureAndEnableHostHa(self):
        self.apiClient.configureHAForHost(self.getHostHaConfigCmd())

        response = self.apiClient.enableHAForHost(self.getHostHaEnableCmd())
        self.assertEqual(response.haenable, True)

    def configure_and_start_ipmi_server(self, power_state=None):
        """
            Setup ipmisim and enable out-of-band management for host
        """
        self.configure_and_enable_oobm()
        self.start_ipmi_server()
        if power_state:
            bmc = IpmiServerContext().bmc
            bmc.powerstate = power_state

    def assert_issue_command_state(self, command, expected):
        """
            Asserts power action result for a given power command
        """
        if command != 'STATUS':
            self.issue_power_action_cmd(command)
        response = self.issue_power_action_cmd('STATUS')
        self.assertEqual(response.powerstate, expected)

    def kill_agent(self):
        t_end = time.time() + 90
        while time.time() < t_end:
            try:
                SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"],
                          passwd=self.hostConfig["password"]).execute \
                    ("kill $(ps aux | grep 'cloudstack-agent' | awk '{print $2}')")
                return
            except Exception:
                print("Cannot ssh into: " + self.host.ipaddress)
        self.fail("Failed to SSH into host and kill the cloudstack-agent: " + self.host.ipaddress)

    def set_host_to_alert(self):
        self.dbclient.execute("update host set host.status = 'Alert' where host.uuid = '%s'" % self.host.id)
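
    # The helpers below poll the host's HA state as reported by listHosts.
    # Roughly, the transitions exercised by these tests are:
    #   Available -> Suspect -> Checking -> Available    (transient check failures)
    #   Available -> Suspect/Checking -> Degraded        (agent down, activity checks failing)
    #   Degraded -> Recovering -> Recovered              (OOBM/IPMI reset of the host succeeds)
    #   Degraded -> Recovering -> Fencing -> Fenced      (recovery fails, host is powered off)
    # Each helper returns as soon as the expected state is observed, or fails the test
    # after its timeout. (The exact state machine lives in the Host-HA framework; this
    # summary is only an orientation aid for reading the tests.)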

    def check_host_transitioned_to_degraded(self):
        t_end = time.time() + 120
        while time.time() < t_end:
            host = self.getHost()
            if host.hostha.hastate == "Degraded":
                return
        self.fail("Host HA state did not transition to Degraded within the timeout")

    def wait_util_host_is_fencing(self):
        t_end = time.time() + 120
        while time.time() < t_end:
            host = self.getHost()
            if host.hostha.hastate == "Fencing":
                return
        self.fail("Host HA state did not transition to Fencing within the timeout")

    def check_host_transitioned_to_suspect(self):
        t_end = time.time() + 120
        while time.time() < t_end:
            host = self.getHost()
            if host.hostha.hastate == "Suspect":
                return
        self.fail("Host HA state did not transition to Suspect within the timeout")

    def check_host_transitioned_to_checking(self):
        t_end = time.time() + 120
        while time.time() < t_end:
            host = self.getHost()
            if host.hostha.hastate == "Checking":
                return
        self.fail("Host HA state did not transition to Checking within the timeout")

    def wait_util_host_is_fenced(self):
        t_end = time.time() + 120
        while time.time() < t_end:
            host = self.getHost()
            if host.hostha.hastate == "Fenced":
                return
        self.fail("Host HA state did not transition to Fenced within the timeout")

    def wait_util_host_is_up(self):
        t_end = time.time() + 120
        while time.time() < t_end:
            host = self.getHost()
            if host.state == "Up":
                return
        self.fail("Host state did not transition to Up within the timeout")

    def stop_agent(self):
        SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"],
                  passwd=self.hostConfig["password"]).execute \
            ("service cloudstack-agent stop")

    def start_agent(self):
        self.ssh_and_restart_agent()
        self.check_host_transition_to_available()

    def ssh_and_restart_agent(self):
        t_end = time.time() + 90
        while time.time() < t_end:
            try:
                SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"],
                          passwd=self.hostConfig["password"]).execute \
                    ("service cloudstack-agent restart")
                return
            except Exception:
                print("Cannot ssh into: " + self.host.ipaddress)
        self.fail("Failed to SSH into host and restart the cloudstack-agent: " + self.host.ipaddress)

    def check_host_transition_to_available(self):
        t_end = time.time() + 90
        while time.time() < t_end:
            host = self.getHost()
            if host.hostha.hastate == "Available":
                return
        self.fail("Host HA state did not transition to Available within the timeout")

    def wait_util_host_is_recovered(self):
        t_end = time.time() + 180
        while time.time() < t_end:
            host = self.getHost()
            if host.hostha.hastate == "Recovered":
                return
        self.fail("Host HA state did not transition to Recovered within the timeout")

    def reset_host(self):
        SshClient(self.host.ipaddress, port=22, user=self.hostConfig["username"],
                  passwd=self.hostConfig["password"]).execute \
            ("reboot")

    def deploy_vm(self):
        vm = VirtualMachine.create(
            self.apiClient,
            services=self.services["virtual_machine"],
            serviceofferingid=self.service_offering.id,
            templateid=self.template.id,
            zoneid=self.zone.id,
            hostid=self.host.id,
            method="POST"
        )

        self.cleanup.append(vm)

    def update_configuration(self, name, value):
        update_configuration_cmd = updateConfiguration.updateConfigurationCmd()
        update_configuration_cmd.name = name
        update_configuration_cmd.value = value
        update_configuration_cmd.clusterid = self.cluster_id

        self.apiClient.updateConfiguration(update_configuration_cmd)
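
    # The tests below follow a common pattern: optionally deploy a VM (to generate activity
    # on the host), bring the host into the Available HA state (OOBM configured and HA
    # enabled), disrupt the cloudstack-agent (stop, kill or restart it), assert the expected
    # HA state transition, and finally restore the agent so that cleanup can succeed.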
@attr(tags = ["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
|
|
def test_ha_stop_agent_host_is_degraded(self):
|
|
"""
|
|
Tests HA state turns Degraded when agent is stopped
|
|
"""
|
|
self.deploy_vm()
|
|
|
|
# Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
|
|
self.get_host_in_available_state()
|
|
|
|
# SSH into the KVM Host and executes kill -9 of the agent
|
|
self.stop_agent()
|
|
|
|
# Checks if the host would turn into Degraded in the next 120 seconds
|
|
try:
|
|
self.check_host_transitioned_to_degraded()
|
|
except Exception as e:
|
|
self.start_agent()
|
|
raise Exception("Warning: Exception during test execution : %s" % e)
|
|
|
|
# Enable Host
|
|
self.start_agent()
|
|
|
|

    #@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_ha_recovering_start_agent_host_is_available(self):
        """
            Tests that the HA state turns Recovered when the agent is killed and the host is reset
        """
        self.deploy_vm()

        # Configure and enable OOBM, set the HA provider and enable HA; asserts the HA state is Available
        self.get_host_in_available_state()

        # SSH into the KVM host and kill the agent process
        self.kill_agent()

        # Check that the host turns Degraded within the next 120 seconds
        try:
            self.check_host_transitioned_to_degraded()
        except Exception as e:
            self.start_agent()
            raise Exception("Warning: Exception during test execution : %s" % e)

        # Reset the host to emulate a shutdown; during boot-up the host should transition into the Recovered state
        self.reset_host()

        # Wait up to 180 seconds for the Degraded host to turn Recovered;
        # on failure, try to revert the host back to Available
        try:
            self.wait_util_host_is_recovered()
        except Exception as e:
            self.start_agent()
            raise Exception("Warning: Exception during test execution : %s" % e)

        # SSH into the KVM host and restart the cloudstack-agent service
        self.start_agent()

    #@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_ha_fencing_host(self):
        """
            Tests that a host is fenced when recovery fails: the agent is killed and the host is reset,
            then an invalid OOBM configuration is applied so that the recover operation fails
            and the host transitions into the Fenced state.
        """
        self.deploy_vm()

        # Configure and enable OOBM, set the HA provider and enable HA; asserts the HA state is Available
        self.get_host_in_available_state()

        # SSH into the KVM host and kill the agent process
        self.kill_agent()

        # Check that the host turns Degraded within the next 120 seconds
        try:
            self.check_host_transitioned_to_degraded()
        except Exception as e:
            self.start_agent()
            raise Exception("Warning: Exception during test execution : %s" % e)

        # Change the OOBM configuration to an invalid address so that the recover operation fails
        cmd = self.getOobmConfigCmd()
        cmd.address = "1.1.1.1"
        self.apiClient.configureOutOfBandManagement(cmd)

        # Reset the host to emulate a shutdown; since recovery cannot succeed, the host should move towards fencing
        self.reset_host()
        self.kill_agent()

        # Wait up to 120 seconds for the Recovering host to turn Fencing;
        # on failure, try to revert the host back to Up
        try:
            self.wait_util_host_is_fencing()
        except Exception as e:
            self.ssh_and_restart_agent()
            raise Exception("Warning: Exception during test execution : %s" % e)

        # Restore the correct OOBM configuration so that the fencing operation can succeed
        self.apiClient.configureOutOfBandManagement(self.getOobmConfigCmd())

        # Wait up to 120 seconds for the Fencing host to turn Fenced;
        # on failure, try to revert the host back to Up
        try:
            self.wait_util_host_is_fenced()
        except Exception as e:
            self.ssh_and_restart_agent()
            raise Exception("Warning: Exception during test execution : %s" % e)

        # SSH into the KVM host and restart the cloudstack-agent service
        self.ssh_and_restart_agent()

        # Wait until the host state is Up so that cleanup can succeed
        self.wait_util_host_is_up()

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_ha_kill_agent_host_is_degraded(self):
        """
            Tests that the HA state turns Suspect/Checking when some activity/health checks fail:
            configures HA, logs into the host and restarts the agent service,
            then confirms the HA state moves through Suspect -> Checking -> Available
        """
        # Configure and enable OOBM, set the HA provider and enable HA; asserts the HA state is Available
        self.get_host_in_available_state()

        # SSH into the KVM host and restart the cloudstack-agent service
        self.ssh_and_restart_agent()

        # Check that the host turns Suspect within the next 120 seconds
        try:
            self.check_host_transitioned_to_suspect()
        except Exception as e:
            self.start_agent()
            raise Exception("Warning: Exception during test execution : %s" % e)

        # Check that the host turns Checking within the next 120 seconds
        try:
            self.check_host_transitioned_to_checking()
        except Exception as e:
            self.start_agent()
            raise Exception("Warning: Exception during test execution : %s" % e)

        # Wait for the host to become Available again
        self.check_host_transition_to_available()