mirror of
https://github.com/apache/cloudstack.git
synced 2025-11-02 11:52:28 +01:00
bug 12709: add management server active fencing
This commit is contained in:
parent
323a07d7e2
commit
dcdd87b30f
31
server/src/com/cloud/cluster/ActiveFencingException.java
Normal file
31
server/src/com/cloud/cluster/ActiveFencingException.java
Normal file
@ -0,0 +1,31 @@
|
||||
/**
|
||||
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
|
||||
*
|
||||
* This software is licensed under the GNU General Public License v3 or later.
|
||||
*
|
||||
* It is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or any later version.
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
*/
|
||||
|
||||
package com.cloud.cluster;
|
||||
|
||||
public class ActiveFencingException extends Exception {
|
||||
private static final long serialVersionUID = -3975376101728211726L;
|
||||
|
||||
public ActiveFencingException(String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public ActiveFencingException(String message, Throwable th) {
|
||||
super(message, th);
|
||||
}
|
||||
}
|
||||
@ -51,8 +51,10 @@ import com.cloud.agent.api.ChangeAgentCommand;
|
||||
import com.cloud.agent.api.Command;
|
||||
import com.cloud.agent.api.PropagateResourceEventCommand;
|
||||
import com.cloud.agent.manager.Commands;
|
||||
import com.cloud.cluster.ManagementServerHost.State;
|
||||
import com.cloud.cluster.agentlb.dao.HostTransferMapDao;
|
||||
import com.cloud.cluster.dao.ManagementServerHostDao;
|
||||
import com.cloud.cluster.dao.ManagementServerHostPeerDao;
|
||||
import com.cloud.configuration.Config;
|
||||
import com.cloud.configuration.dao.ConfigurationDao;
|
||||
import com.cloud.exception.AgentUnavailableException;
|
||||
@ -119,6 +121,7 @@ public class ClusterManagerImpl implements ClusterManager {
|
||||
private ClusterServiceAdapter _currentServiceAdapter;
|
||||
|
||||
private ManagementServerHostDao _mshostDao;
|
||||
private ManagementServerHostPeerDao _mshostPeerDao;
|
||||
private HostDao _hostDao;
|
||||
private HostTransferMapDao _hostTransferDao;
|
||||
|
||||
@ -683,6 +686,8 @@ public class ClusterManagerImpl implements ClusterManager {
|
||||
}
|
||||
|
||||
invalidHeartbeatConnection();
|
||||
} catch(ActiveFencingException e) {
|
||||
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
|
||||
} catch (Throwable e) {
|
||||
if(isRootCauseConnectionRelated(e.getCause())) {
|
||||
s_logger.error("DB communication problem detected");
|
||||
@ -804,6 +809,34 @@ public class ClusterManagerImpl implements ClusterManager {
|
||||
this._notificationMsgs.add(msg);
|
||||
this._notificationMsgs.notifyAll();
|
||||
}
|
||||
|
||||
switch(msg.getMessageType()) {
|
||||
case nodeAdded:
|
||||
{
|
||||
List<ManagementServerHostVO> l = msg.getNodes();
|
||||
if(l != null && l.size() > 0) {
|
||||
for(ManagementServerHostVO mshost: l) {
|
||||
_mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Up);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case nodeRemoved:
|
||||
{
|
||||
List<ManagementServerHostVO> l = msg.getNodes();
|
||||
if(l != null && l.size() > 0) {
|
||||
for(ManagementServerHostVO mshost: l) {
|
||||
_mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Down);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default :
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private ClusterManagerMessage getNextNotificationMessage() {
|
||||
@ -848,7 +881,7 @@ public class ClusterManagerImpl implements ClusterManager {
|
||||
}
|
||||
}
|
||||
|
||||
private void peerScan() {
|
||||
private void peerScan() throws ActiveFencingException {
|
||||
Date cutTime = DateUtil.currentGMTTime();
|
||||
|
||||
List<ManagementServerHostVO> currentList = _mshostDao.getActiveList(new Date(cutTime.getTime() - _heartbeatThreshold));
|
||||
@ -857,6 +890,13 @@ public class ClusterManagerImpl implements ClusterManager {
|
||||
List<ManagementServerHostVO> invalidatedNodeList = new ArrayList<ManagementServerHostVO>();
|
||||
|
||||
if(_mshostId != null) {
|
||||
|
||||
if(_mshostPeerDao.countStateSeenInPeers(_mshostId, _runId, ManagementServerHost.State.Down) > 0) {
|
||||
String msg = "We have detected that at least one management server peer reports that this management server is down, perform active fencing to avoid split-brain situation";
|
||||
s_logger.error(msg);
|
||||
throw new ActiveFencingException(msg);
|
||||
}
|
||||
|
||||
// only if we have already attached to cluster, will we start to check leaving nodes
|
||||
for(Map.Entry<Long, ManagementServerHostVO> entry : _activePeers.entrySet()) {
|
||||
|
||||
@ -1007,6 +1047,8 @@ public class ClusterManagerImpl implements ClusterManager {
|
||||
if (s_logger.isInfoEnabled()) {
|
||||
s_logger.info("Management server (host id : " + _mshostId + ") is being started at " + _clusterNodeIP + ":" + _currentServiceAdapter.getServicePort());
|
||||
}
|
||||
|
||||
_mshostPeerDao.clearPeerInfo(_mshostId);
|
||||
|
||||
// use seperate thread for heartbeat updates
|
||||
_heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), _heartbeatInterval, _heartbeatInterval, TimeUnit.MILLISECONDS);
|
||||
@ -1067,7 +1109,12 @@ public class ClusterManagerImpl implements ClusterManager {
|
||||
if (_mshostDao == null) {
|
||||
throw new ConfigurationException("Unable to get " + ManagementServerHostDao.class.getName());
|
||||
}
|
||||
|
||||
|
||||
_mshostPeerDao = locator.getDao(ManagementServerHostPeerDao.class);
|
||||
if (_mshostPeerDao == null) {
|
||||
throw new ConfigurationException("Unable to get " + ManagementServerHostPeerDao.class.getName());
|
||||
}
|
||||
|
||||
_hostDao = locator.getDao(HostDao.class);
|
||||
if (_hostDao == null) {
|
||||
throw new ConfigurationException("Unable to get " + HostDao.class.getName());
|
||||
|
||||
121
server/src/com/cloud/cluster/ManagementServerHostPeerVO.java
Normal file
121
server/src/com/cloud/cluster/ManagementServerHostPeerVO.java
Normal file
@ -0,0 +1,121 @@
|
||||
/**
|
||||
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
|
||||
*
|
||||
* This software is licensed under the GNU General Public License v3 or later.
|
||||
*
|
||||
* It is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or any later version.
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
*/
|
||||
|
||||
package com.cloud.cluster;
|
||||
|
||||
import java.util.Date;
|
||||
|
||||
import javax.persistence.Column;
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.EnumType;
|
||||
import javax.persistence.Enumerated;
|
||||
import javax.persistence.GeneratedValue;
|
||||
import javax.persistence.GenerationType;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Table;
|
||||
import javax.persistence.Temporal;
|
||||
import javax.persistence.TemporalType;
|
||||
|
||||
import com.cloud.utils.DateUtil;
|
||||
|
||||
@Entity
|
||||
@Table(name="mshost_peer")
|
||||
public class ManagementServerHostPeerVO {
|
||||
|
||||
@Id
|
||||
@GeneratedValue(strategy=GenerationType.IDENTITY)
|
||||
@Column(name="id")
|
||||
private long id;
|
||||
|
||||
@Column(name="owner_mshost", updatable=true, nullable=false)
|
||||
private long ownerMshost;
|
||||
|
||||
@Column(name="peer_mshost", updatable=true, nullable=false)
|
||||
private long peerMshost;
|
||||
|
||||
@Column(name="peer_runid", updatable=true, nullable=false)
|
||||
private long peerRunid;
|
||||
|
||||
@Column(name="peer_state", updatable = true, nullable=false)
|
||||
@Enumerated(value=EnumType.STRING)
|
||||
private ManagementServerHost.State peerState;
|
||||
|
||||
@Temporal(TemporalType.TIMESTAMP)
|
||||
@Column(name="last_update", updatable=true, nullable=true)
|
||||
private Date lastUpdateTime;
|
||||
|
||||
public ManagementServerHostPeerVO() {
|
||||
}
|
||||
|
||||
public ManagementServerHostPeerVO(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState) {
|
||||
this.ownerMshost = ownerMshost;
|
||||
this.peerMshost = peerMshost;
|
||||
this.peerRunid = peerRunid;
|
||||
this.peerState = peerState;
|
||||
|
||||
this.lastUpdateTime = DateUtil.currentGMTTime();
|
||||
}
|
||||
|
||||
public long getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(long id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public long getOwnerMshost() {
|
||||
return ownerMshost;
|
||||
}
|
||||
|
||||
public void setOwnerMshost(long ownerMshost) {
|
||||
this.ownerMshost = ownerMshost;
|
||||
}
|
||||
|
||||
public long getPeerMshost() {
|
||||
return peerMshost;
|
||||
}
|
||||
|
||||
public void setPeerMshost(long peerMshost) {
|
||||
this.peerMshost = peerMshost;
|
||||
}
|
||||
|
||||
public long getPeerRunid() {
|
||||
return peerRunid;
|
||||
}
|
||||
|
||||
public void setPeerRunid(long peerRunid) {
|
||||
this.peerRunid = peerRunid;
|
||||
}
|
||||
|
||||
public ManagementServerHost.State getPeerState() {
|
||||
return peerState;
|
||||
}
|
||||
|
||||
public void setPeerState(ManagementServerHost.State peerState) {
|
||||
this.peerState = peerState;
|
||||
}
|
||||
|
||||
public Date getLastUpdateTime() {
|
||||
return lastUpdateTime;
|
||||
}
|
||||
|
||||
public void setLastUpdateTime(Date lastUpdateTime) {
|
||||
this.lastUpdateTime = lastUpdateTime;
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,29 @@
|
||||
/**
|
||||
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
|
||||
*
|
||||
* This software is licensed under the GNU General Public License v3 or later.
|
||||
*
|
||||
* It is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or any later version.
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
*/
|
||||
|
||||
package com.cloud.cluster.dao;
|
||||
|
||||
import com.cloud.cluster.ManagementServerHost;
|
||||
import com.cloud.cluster.ManagementServerHostPeerVO;
|
||||
import com.cloud.utils.db.GenericDao;
|
||||
|
||||
public interface ManagementServerHostPeerDao extends GenericDao<ManagementServerHostPeerVO, Long> {
|
||||
void clearPeerInfo(long ownerMshost);
|
||||
void updatePeerInfo(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState);
|
||||
int countStateSeenInPeers(long mshost, long runid, ManagementServerHost.State state);
|
||||
}
|
||||
@ -0,0 +1,108 @@
|
||||
/**
|
||||
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
|
||||
*
|
||||
* This software is licensed under the GNU General Public License v3 or later.
|
||||
*
|
||||
* It is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or any later version.
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
*/
|
||||
|
||||
package com.cloud.cluster.dao;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import javax.ejb.Local;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import com.cloud.cluster.ManagementServerHost;
|
||||
import com.cloud.cluster.ManagementServerHostPeerVO;
|
||||
import com.cloud.utils.db.DB;
|
||||
import com.cloud.utils.db.GenericDaoBase;
|
||||
import com.cloud.utils.db.SearchBuilder;
|
||||
import com.cloud.utils.db.SearchCriteria;
|
||||
import com.cloud.utils.db.Transaction;
|
||||
|
||||
@Local(value={ManagementServerHostPeerDao.class})
|
||||
public class ManagementServerHostPeerDaoImpl extends GenericDaoBase<ManagementServerHostPeerVO, Long> implements ManagementServerHostPeerDao {
|
||||
private static final Logger s_logger = Logger.getLogger(ManagementServerHostPeerDaoImpl.class);
|
||||
|
||||
private final SearchBuilder<ManagementServerHostPeerVO> ClearPeerSearch;
|
||||
private final SearchBuilder<ManagementServerHostPeerVO> FindForUpdateSearch;
|
||||
private final SearchBuilder<ManagementServerHostPeerVO> CountSearch;
|
||||
|
||||
public ManagementServerHostPeerDaoImpl() {
|
||||
ClearPeerSearch = createSearchBuilder();
|
||||
ClearPeerSearch.and("ownerMshost", ClearPeerSearch.entity().getOwnerMshost(), SearchCriteria.Op.EQ);
|
||||
ClearPeerSearch.done();
|
||||
|
||||
FindForUpdateSearch = createSearchBuilder();
|
||||
FindForUpdateSearch.and("ownerMshost", FindForUpdateSearch.entity().getOwnerMshost(), SearchCriteria.Op.EQ);
|
||||
FindForUpdateSearch.and("peerMshost", FindForUpdateSearch.entity().getPeerMshost(), SearchCriteria.Op.EQ);
|
||||
FindForUpdateSearch.and("peerRunid", FindForUpdateSearch.entity().getPeerRunid(), SearchCriteria.Op.EQ);
|
||||
FindForUpdateSearch.done();
|
||||
|
||||
CountSearch = createSearchBuilder();
|
||||
CountSearch.and("peerMshost", CountSearch.entity().getPeerMshost(), SearchCriteria.Op.EQ);
|
||||
CountSearch.and("peerRunid", CountSearch.entity().getPeerRunid(), SearchCriteria.Op.EQ);
|
||||
CountSearch.and("peerState", CountSearch.entity().getPeerState(), SearchCriteria.Op.EQ);
|
||||
CountSearch.done();
|
||||
}
|
||||
|
||||
@Override
|
||||
@DB
|
||||
public void clearPeerInfo(long ownerMshost) {
|
||||
SearchCriteria<ManagementServerHostPeerVO> sc = ClearPeerSearch.create();
|
||||
sc.setParameters("ownerMshost", ownerMshost);
|
||||
|
||||
expunge(sc);
|
||||
}
|
||||
|
||||
@Override
|
||||
@DB
|
||||
public void updatePeerInfo(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState) {
|
||||
Transaction txn = Transaction.currentTxn();
|
||||
try {
|
||||
txn.start();
|
||||
|
||||
SearchCriteria<ManagementServerHostPeerVO> sc = FindForUpdateSearch.create();
|
||||
sc.setParameters("ownerMshost", ownerMshost);
|
||||
sc.setParameters("peerMshost", peerMshost);
|
||||
sc.setParameters("peerRunid", peerRunid);
|
||||
List<ManagementServerHostPeerVO> l = listBy(sc);
|
||||
if(l.size() == 1) {
|
||||
ManagementServerHostPeerVO peer = l.get(0);
|
||||
peer.setPeerState(peerState);
|
||||
update(peer.getId(), peer);
|
||||
} else {
|
||||
ManagementServerHostPeerVO peer = new ManagementServerHostPeerVO(ownerMshost, peerMshost, peerRunid, peerState);
|
||||
persist(peer);
|
||||
}
|
||||
txn.commit();
|
||||
} catch(Exception e) {
|
||||
s_logger.warn("Unexpected exception, ", e);
|
||||
txn.rollback();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@DB
|
||||
public int countStateSeenInPeers(long mshost, long runid, ManagementServerHost.State state) {
|
||||
SearchCriteria<ManagementServerHostPeerVO> sc = CountSearch.create();
|
||||
sc.setParameters("peerMshost", mshost);
|
||||
sc.setParameters("peerRunid", runid);
|
||||
sc.setParameters("peerState", state);
|
||||
|
||||
List<ManagementServerHostPeerVO> l = listBy(sc);
|
||||
return l.size();
|
||||
}
|
||||
}
|
||||
@ -39,6 +39,7 @@ import com.cloud.cluster.ClusterFenceManagerImpl;
|
||||
import com.cloud.cluster.ClusterManagerImpl;
|
||||
import com.cloud.cluster.agentlb.dao.HostTransferMapDaoImpl;
|
||||
import com.cloud.cluster.dao.ManagementServerHostDaoImpl;
|
||||
import com.cloud.cluster.dao.ManagementServerHostPeerDaoImpl;
|
||||
import com.cloud.cluster.dao.StackMaidDaoImpl;
|
||||
import com.cloud.configuration.dao.ConfigurationDaoImpl;
|
||||
import com.cloud.configuration.dao.ResourceCountDaoImpl;
|
||||
@ -262,6 +263,7 @@ public class DefaultComponentLibrary extends ComponentLibraryBase implements Com
|
||||
addDao("ConsoleProxyDao", ConsoleProxyDaoImpl.class);
|
||||
addDao("SecondaryStorageVmDao", SecondaryStorageVmDaoImpl.class);
|
||||
addDao("ManagementServerHostDao", ManagementServerHostDaoImpl.class);
|
||||
addDao("ManagementServerHostPeerDao", ManagementServerHostPeerDaoImpl.class);
|
||||
addDao("AgentUpgradeDao", AgentUpgradeDaoImpl.class);
|
||||
addDao("SnapshotDao", SnapshotDaoImpl.class);
|
||||
addDao("AsyncJobDao", AsyncJobDaoImpl.class);
|
||||
|
||||
@ -789,6 +789,20 @@ CREATE TABLE `cloud`.`mshost` (
|
||||
INDEX `i_mshost__last_update`(`last_update`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
CREATE TABLE `cloud`.`mshost_peer` (
|
||||
`id` bigint unsigned NOT NULL auto_increment,
|
||||
`owner_mshost` bigint unsigned NOT NULL,
|
||||
`peer_mshost` bigint unsigned NOT NULL,
|
||||
`peer_runid` bigint NOT NULL,
|
||||
`peer_state` varchar(10) NOT NULL DEFAULT 'Down',
|
||||
`last_update` DATETIME NULL COMMENT 'Last record update time',
|
||||
|
||||
PRIMARY KEY (`id`),
|
||||
CONSTRAINT `fk_mshost_peer__owner_mshost` FOREIGN KEY (`owner_mshost`) REFERENCES `mshost`(`id`) ON DELETE CASCADE,
|
||||
CONSTRAINT `fk_mshost_peer__peer_mshost` FOREIGN KEY (`peer_mshost`) REFERENCES `mshost`(`id`),
|
||||
UNIQUE `i_mshost_peer__owner_peer_runid`(`owner_mshost`, `peer_mshost`, `peer_runid`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
CREATE TABLE `cloud`.`host_tags` (
|
||||
`id` bigint unsigned NOT NULL auto_increment,
|
||||
`host_id` bigint unsigned NOT NULL COMMENT 'host id',
|
||||
|
||||
@ -4,3 +4,18 @@
|
||||
|
||||
ALTER TABLE `cloud`.`vm_template` MODIFY `extractable` int(1) unsigned NOT NULL default 0 COMMENT 'Is this template extractable';
|
||||
INSERT INTO configuration (category, instance, component, name, value, description) VALUES ('Advanced', 'DEFAULT', 'management-server', 'external.network.stats.interval', '300', 'Interval (in seconds) to report external network statistics.');
|
||||
|
||||
CREATE TABLE `cloud`.`mshost_peer` (
|
||||
`id` bigint unsigned NOT NULL auto_increment,
|
||||
`owner_mshost` bigint unsigned NOT NULL,
|
||||
`peer_mshost` bigint unsigned NOT NULL,
|
||||
`peer_runid` bigint NOT NULL,
|
||||
`peer_state` varchar(10) NOT NULL DEFAULT 'Down',
|
||||
`last_update` DATETIME NULL COMMENT 'Last record update time',
|
||||
|
||||
PRIMARY KEY (`id`),
|
||||
CONSTRAINT `fk_mshost_peer__owner_mshost` FOREIGN KEY (`owner_mshost`) REFERENCES `mshost`(`id`) ON DELETE CASCADE,
|
||||
CONSTRAINT `fk_mshost_peer__peer_mshost` FOREIGN KEY (`peer_mshost`) REFERENCES `mshost`(`id`),
|
||||
UNIQUE `i_mshost_peer__owner_peer_runid`(`owner_mshost`, `peer_mshost`, `peer_runid`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user