bug 12709: add management server active fencing

This commit is contained in:
Kelven Yang 2012-01-04 18:06:15 -08:00
parent 323a07d7e2
commit dcdd87b30f
8 changed files with 369 additions and 2 deletions

View File

@ -0,0 +1,31 @@
/**
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
*
* This software is licensed under the GNU General Public License v3 or later.
*
* It is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package com.cloud.cluster;
public class ActiveFencingException extends Exception {
private static final long serialVersionUID = -3975376101728211726L;
public ActiveFencingException(String message) {
super(message);
}
public ActiveFencingException(String message, Throwable th) {
super(message, th);
}
}

View File

@ -51,8 +51,10 @@ import com.cloud.agent.api.ChangeAgentCommand;
import com.cloud.agent.api.Command;
import com.cloud.agent.api.PropagateResourceEventCommand;
import com.cloud.agent.manager.Commands;
import com.cloud.cluster.ManagementServerHost.State;
import com.cloud.cluster.agentlb.dao.HostTransferMapDao;
import com.cloud.cluster.dao.ManagementServerHostDao;
import com.cloud.cluster.dao.ManagementServerHostPeerDao;
import com.cloud.configuration.Config;
import com.cloud.configuration.dao.ConfigurationDao;
import com.cloud.exception.AgentUnavailableException;
@ -119,6 +121,7 @@ public class ClusterManagerImpl implements ClusterManager {
private ClusterServiceAdapter _currentServiceAdapter;
private ManagementServerHostDao _mshostDao;
private ManagementServerHostPeerDao _mshostPeerDao;
private HostDao _hostDao;
private HostTransferMapDao _hostTransferDao;
@ -683,6 +686,8 @@ public class ClusterManagerImpl implements ClusterManager {
}
invalidHeartbeatConnection();
} catch(ActiveFencingException e) {
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
} catch (Throwable e) {
if(isRootCauseConnectionRelated(e.getCause())) {
s_logger.error("DB communication problem detected");
@ -804,6 +809,34 @@ public class ClusterManagerImpl implements ClusterManager {
this._notificationMsgs.add(msg);
this._notificationMsgs.notifyAll();
}
switch(msg.getMessageType()) {
case nodeAdded:
{
List<ManagementServerHostVO> l = msg.getNodes();
if(l != null && l.size() > 0) {
for(ManagementServerHostVO mshost: l) {
_mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Up);
}
}
}
break;
case nodeRemoved:
{
List<ManagementServerHostVO> l = msg.getNodes();
if(l != null && l.size() > 0) {
for(ManagementServerHostVO mshost: l) {
_mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Down);
}
}
}
break;
default :
break;
}
}
private ClusterManagerMessage getNextNotificationMessage() {
@ -848,7 +881,7 @@ public class ClusterManagerImpl implements ClusterManager {
}
}
private void peerScan() {
private void peerScan() throws ActiveFencingException {
Date cutTime = DateUtil.currentGMTTime();
List<ManagementServerHostVO> currentList = _mshostDao.getActiveList(new Date(cutTime.getTime() - _heartbeatThreshold));
@ -857,6 +890,13 @@ public class ClusterManagerImpl implements ClusterManager {
List<ManagementServerHostVO> invalidatedNodeList = new ArrayList<ManagementServerHostVO>();
if(_mshostId != null) {
if(_mshostPeerDao.countStateSeenInPeers(_mshostId, _runId, ManagementServerHost.State.Down) > 0) {
String msg = "We have detected that at least one management server peer reports that this management server is down, perform active fencing to avoid split-brain situation";
s_logger.error(msg);
throw new ActiveFencingException(msg);
}
// only if we have already attached to cluster, will we start to check leaving nodes
for(Map.Entry<Long, ManagementServerHostVO> entry : _activePeers.entrySet()) {
@ -1007,6 +1047,8 @@ public class ClusterManagerImpl implements ClusterManager {
if (s_logger.isInfoEnabled()) {
s_logger.info("Management server (host id : " + _mshostId + ") is being started at " + _clusterNodeIP + ":" + _currentServiceAdapter.getServicePort());
}
_mshostPeerDao.clearPeerInfo(_mshostId);
// use seperate thread for heartbeat updates
_heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), _heartbeatInterval, _heartbeatInterval, TimeUnit.MILLISECONDS);
@ -1067,7 +1109,12 @@ public class ClusterManagerImpl implements ClusterManager {
if (_mshostDao == null) {
throw new ConfigurationException("Unable to get " + ManagementServerHostDao.class.getName());
}
_mshostPeerDao = locator.getDao(ManagementServerHostPeerDao.class);
if (_mshostPeerDao == null) {
throw new ConfigurationException("Unable to get " + ManagementServerHostPeerDao.class.getName());
}
_hostDao = locator.getDao(HostDao.class);
if (_hostDao == null) {
throw new ConfigurationException("Unable to get " + HostDao.class.getName());

View File

@ -0,0 +1,121 @@
/**
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
*
* This software is licensed under the GNU General Public License v3 or later.
*
* It is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package com.cloud.cluster;
import java.util.Date;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.EnumType;
import javax.persistence.Enumerated;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
import com.cloud.utils.DateUtil;
@Entity
@Table(name="mshost_peer")
public class ManagementServerHostPeerVO {
@Id
@GeneratedValue(strategy=GenerationType.IDENTITY)
@Column(name="id")
private long id;
@Column(name="owner_mshost", updatable=true, nullable=false)
private long ownerMshost;
@Column(name="peer_mshost", updatable=true, nullable=false)
private long peerMshost;
@Column(name="peer_runid", updatable=true, nullable=false)
private long peerRunid;
@Column(name="peer_state", updatable = true, nullable=false)
@Enumerated(value=EnumType.STRING)
private ManagementServerHost.State peerState;
@Temporal(TemporalType.TIMESTAMP)
@Column(name="last_update", updatable=true, nullable=true)
private Date lastUpdateTime;
public ManagementServerHostPeerVO() {
}
public ManagementServerHostPeerVO(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState) {
this.ownerMshost = ownerMshost;
this.peerMshost = peerMshost;
this.peerRunid = peerRunid;
this.peerState = peerState;
this.lastUpdateTime = DateUtil.currentGMTTime();
}
public long getId() {
return id;
}
public void setId(long id) {
this.id = id;
}
public long getOwnerMshost() {
return ownerMshost;
}
public void setOwnerMshost(long ownerMshost) {
this.ownerMshost = ownerMshost;
}
public long getPeerMshost() {
return peerMshost;
}
public void setPeerMshost(long peerMshost) {
this.peerMshost = peerMshost;
}
public long getPeerRunid() {
return peerRunid;
}
public void setPeerRunid(long peerRunid) {
this.peerRunid = peerRunid;
}
public ManagementServerHost.State getPeerState() {
return peerState;
}
public void setPeerState(ManagementServerHost.State peerState) {
this.peerState = peerState;
}
public Date getLastUpdateTime() {
return lastUpdateTime;
}
public void setLastUpdateTime(Date lastUpdateTime) {
this.lastUpdateTime = lastUpdateTime;
}
}

View File

@ -0,0 +1,29 @@
/**
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
*
* This software is licensed under the GNU General Public License v3 or later.
*
* It is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package com.cloud.cluster.dao;
import com.cloud.cluster.ManagementServerHost;
import com.cloud.cluster.ManagementServerHostPeerVO;
import com.cloud.utils.db.GenericDao;
public interface ManagementServerHostPeerDao extends GenericDao<ManagementServerHostPeerVO, Long> {
void clearPeerInfo(long ownerMshost);
void updatePeerInfo(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState);
int countStateSeenInPeers(long mshost, long runid, ManagementServerHost.State state);
}

View File

@ -0,0 +1,108 @@
/**
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
*
* This software is licensed under the GNU General Public License v3 or later.
*
* It is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package com.cloud.cluster.dao;
import java.util.List;
import javax.ejb.Local;
import org.apache.log4j.Logger;
import com.cloud.cluster.ManagementServerHost;
import com.cloud.cluster.ManagementServerHostPeerVO;
import com.cloud.utils.db.DB;
import com.cloud.utils.db.GenericDaoBase;
import com.cloud.utils.db.SearchBuilder;
import com.cloud.utils.db.SearchCriteria;
import com.cloud.utils.db.Transaction;
@Local(value={ManagementServerHostPeerDao.class})
public class ManagementServerHostPeerDaoImpl extends GenericDaoBase<ManagementServerHostPeerVO, Long> implements ManagementServerHostPeerDao {
private static final Logger s_logger = Logger.getLogger(ManagementServerHostPeerDaoImpl.class);
private final SearchBuilder<ManagementServerHostPeerVO> ClearPeerSearch;
private final SearchBuilder<ManagementServerHostPeerVO> FindForUpdateSearch;
private final SearchBuilder<ManagementServerHostPeerVO> CountSearch;
public ManagementServerHostPeerDaoImpl() {
ClearPeerSearch = createSearchBuilder();
ClearPeerSearch.and("ownerMshost", ClearPeerSearch.entity().getOwnerMshost(), SearchCriteria.Op.EQ);
ClearPeerSearch.done();
FindForUpdateSearch = createSearchBuilder();
FindForUpdateSearch.and("ownerMshost", FindForUpdateSearch.entity().getOwnerMshost(), SearchCriteria.Op.EQ);
FindForUpdateSearch.and("peerMshost", FindForUpdateSearch.entity().getPeerMshost(), SearchCriteria.Op.EQ);
FindForUpdateSearch.and("peerRunid", FindForUpdateSearch.entity().getPeerRunid(), SearchCriteria.Op.EQ);
FindForUpdateSearch.done();
CountSearch = createSearchBuilder();
CountSearch.and("peerMshost", CountSearch.entity().getPeerMshost(), SearchCriteria.Op.EQ);
CountSearch.and("peerRunid", CountSearch.entity().getPeerRunid(), SearchCriteria.Op.EQ);
CountSearch.and("peerState", CountSearch.entity().getPeerState(), SearchCriteria.Op.EQ);
CountSearch.done();
}
@Override
@DB
public void clearPeerInfo(long ownerMshost) {
SearchCriteria<ManagementServerHostPeerVO> sc = ClearPeerSearch.create();
sc.setParameters("ownerMshost", ownerMshost);
expunge(sc);
}
@Override
@DB
public void updatePeerInfo(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState) {
Transaction txn = Transaction.currentTxn();
try {
txn.start();
SearchCriteria<ManagementServerHostPeerVO> sc = FindForUpdateSearch.create();
sc.setParameters("ownerMshost", ownerMshost);
sc.setParameters("peerMshost", peerMshost);
sc.setParameters("peerRunid", peerRunid);
List<ManagementServerHostPeerVO> l = listBy(sc);
if(l.size() == 1) {
ManagementServerHostPeerVO peer = l.get(0);
peer.setPeerState(peerState);
update(peer.getId(), peer);
} else {
ManagementServerHostPeerVO peer = new ManagementServerHostPeerVO(ownerMshost, peerMshost, peerRunid, peerState);
persist(peer);
}
txn.commit();
} catch(Exception e) {
s_logger.warn("Unexpected exception, ", e);
txn.rollback();
}
}
@Override
@DB
public int countStateSeenInPeers(long mshost, long runid, ManagementServerHost.State state) {
SearchCriteria<ManagementServerHostPeerVO> sc = CountSearch.create();
sc.setParameters("peerMshost", mshost);
sc.setParameters("peerRunid", runid);
sc.setParameters("peerState", state);
List<ManagementServerHostPeerVO> l = listBy(sc);
return l.size();
}
}

View File

@ -39,6 +39,7 @@ import com.cloud.cluster.ClusterFenceManagerImpl;
import com.cloud.cluster.ClusterManagerImpl;
import com.cloud.cluster.agentlb.dao.HostTransferMapDaoImpl;
import com.cloud.cluster.dao.ManagementServerHostDaoImpl;
import com.cloud.cluster.dao.ManagementServerHostPeerDaoImpl;
import com.cloud.cluster.dao.StackMaidDaoImpl;
import com.cloud.configuration.dao.ConfigurationDaoImpl;
import com.cloud.configuration.dao.ResourceCountDaoImpl;
@ -262,6 +263,7 @@ public class DefaultComponentLibrary extends ComponentLibraryBase implements Com
addDao("ConsoleProxyDao", ConsoleProxyDaoImpl.class);
addDao("SecondaryStorageVmDao", SecondaryStorageVmDaoImpl.class);
addDao("ManagementServerHostDao", ManagementServerHostDaoImpl.class);
addDao("ManagementServerHostPeerDao", ManagementServerHostPeerDaoImpl.class);
addDao("AgentUpgradeDao", AgentUpgradeDaoImpl.class);
addDao("SnapshotDao", SnapshotDaoImpl.class);
addDao("AsyncJobDao", AsyncJobDaoImpl.class);

View File

@ -789,6 +789,20 @@ CREATE TABLE `cloud`.`mshost` (
INDEX `i_mshost__last_update`(`last_update`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE `cloud`.`mshost_peer` (
`id` bigint unsigned NOT NULL auto_increment,
`owner_mshost` bigint unsigned NOT NULL,
`peer_mshost` bigint unsigned NOT NULL,
`peer_runid` bigint NOT NULL,
`peer_state` varchar(10) NOT NULL DEFAULT 'Down',
`last_update` DATETIME NULL COMMENT 'Last record update time',
PRIMARY KEY (`id`),
CONSTRAINT `fk_mshost_peer__owner_mshost` FOREIGN KEY (`owner_mshost`) REFERENCES `mshost`(`id`) ON DELETE CASCADE,
CONSTRAINT `fk_mshost_peer__peer_mshost` FOREIGN KEY (`peer_mshost`) REFERENCES `mshost`(`id`),
UNIQUE `i_mshost_peer__owner_peer_runid`(`owner_mshost`, `peer_mshost`, `peer_runid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE `cloud`.`host_tags` (
`id` bigint unsigned NOT NULL auto_increment,
`host_id` bigint unsigned NOT NULL COMMENT 'host id',

View File

@ -4,3 +4,18 @@
ALTER TABLE `cloud`.`vm_template` MODIFY `extractable` int(1) unsigned NOT NULL default 0 COMMENT 'Is this template extractable';
INSERT INTO configuration (category, instance, component, name, value, description) VALUES ('Advanced', 'DEFAULT', 'management-server', 'external.network.stats.interval', '300', 'Interval (in seconds) to report external network statistics.');
CREATE TABLE `cloud`.`mshost_peer` (
`id` bigint unsigned NOT NULL auto_increment,
`owner_mshost` bigint unsigned NOT NULL,
`peer_mshost` bigint unsigned NOT NULL,
`peer_runid` bigint NOT NULL,
`peer_state` varchar(10) NOT NULL DEFAULT 'Down',
`last_update` DATETIME NULL COMMENT 'Last record update time',
PRIMARY KEY (`id`),
CONSTRAINT `fk_mshost_peer__owner_mshost` FOREIGN KEY (`owner_mshost`) REFERENCES `mshost`(`id`) ON DELETE CASCADE,
CONSTRAINT `fk_mshost_peer__peer_mshost` FOREIGN KEY (`peer_mshost`) REFERENCES `mshost`(`id`),
UNIQUE `i_mshost_peer__owner_peer_runid`(`owner_mshost`, `peer_mshost`, `peer_runid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;