CLOUDSTACK-10188 - Resource Accounting for primary storage is Broken when Domains are in use (#2362)

During storage expunge, the domain-level resource counter for primary storage space is not updated. As a result, the domain resource statistics for primary storage become overfilled (the counter only increases and never decreases).

The global scheduled task (enabled when resourcecount.check.interval > 0) provides a workaround but does not truly fix the problem: when accounts inside domains allocate and deallocate primary_storage intensively, the stale counters lead to operations being blocked.

NB: Marvin tests could not be implemented because Marvin stores an odd primary storage volume size of 100 in the database when creating a VM from a template. This may warrant opening a separate issue for that bug.
This commit is contained in:
Bitworks Software, Ltd 2018-01-10 23:41:26 +07:00 committed by Rohit Yadav
parent 64832fd70a
commit 7ca4582a85
6 changed files with 226 additions and 201 deletions

View File

@ -28,7 +28,7 @@ import org.apache.cloudstack.framework.config.ConfigKey;
public interface ResourceLimitService {
static final ConfigKey<Long> ResourceCountCheckInterval = new ConfigKey<Long>("Advanced", Long.class, "resourcecount.check.interval", "300",
"Time (in seconds) to wait before retrying resource count check task. Default is 300, Setting this to 0 will not run the task", false);
"Time (in seconds) to wait before running resource recalculation and fixing task. Default is 300 seconds, Setting this to 0 disables execution of the task", false);
/**
* Updates an existing resource limit with the specified details. If a limit doesn't exist, will create one.

View File

@ -1576,8 +1576,7 @@ public class VolumeOrchestrator extends ManagerBase implements VolumeOrchestrati
UsageEventUtils.publishUsageEvent(EventTypes.EVENT_VOLUME_DELETE, volume.getAccountId(), volume.getDataCenterId(), volume.getId(), volume.getName(),
Volume.class.getName(), volume.getUuid(), volume.isDisplayVolume());
_resourceLimitMgr.decrementResourceCount(volume.getAccountId(), ResourceType.volume, volume.isDisplay());
//FIXME - why recalculate and not decrement
_resourceLimitMgr.recalculateResourceCount(volume.getAccountId(), volume.getDomainId(), ResourceType.primary_storage.getOrdinal());
_resourceLimitMgr.decrementResourceCount(volume.getAccountId(), ResourceType.primary_storage, volume.isDisplay(), new Long(volume.getSize()));
} catch (Exception e) {
s_logger.debug("Failed to destroy volume" + volume.getId(), e);
throw new CloudRuntimeException("Failed to destroy volume" + volume.getId(), e);

View File

@ -107,7 +107,7 @@ import com.cloud.vm.dao.UserVmDao;
import com.cloud.vm.dao.VMInstanceDao;
@Component
public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLimitService, Configurable{
public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLimitService, Configurable {
public static final Logger s_logger = Logger.getLogger(ResourceLimitManagerImpl.class);
@Inject
@ -410,6 +410,81 @@ public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLim
return max;
}
/**
 * Verifies domain-level limits for a pending allocation of {@code numResources} units of {@code type}.
 * The walk starts at the project's domain when a project is supplied, otherwise at the account's
 * domain, and follows {@code getParent()} until no parent remains. The ROOT domain is never checked.
 *
 * @param account owner whose domain is the starting point when {@code project} is null
 * @param project optional project; when non-null its domain is the starting point
 * @param type resource type being allocated
 * @param numResources amount requested on top of the domain's current count
 * @throws ResourceAllocationException if a domain with a finite limit would be exceeded
 */
private void checkDomainResourceLimit(final Account account, final Project project, final ResourceType type, long numResources) throws ResourceAllocationException {
    Long domainId;
    if (project == null) {
        domainId = account.getDomainId();
    } else {
        domainId = project.getDomainId();
    }

    while (domainId != null) {
        final DomainVO domain = _domainDao.findById(domainId);

        // the ROOT domain is exempt from limit checks
        if (domainId != Domain.ROOT_DOMAIN) {
            final long limit = findCorrectResourceLimitForDomain(domain, type);
            final long used = _resourceCountDao.getResourceCount(domainId, ResourceOwnerType.Domain, type);
            final long wanted = used + numResources;

            // shared tail for both the debug trace and the failure message
            final String details = new StringBuilder(" domain resource limits of Type '")
                    .append(type)
                    .append("' for Domain Id = ").append(domainId)
                    .append(" is exceeded: Domain Resource Limit = ").append(limit)
                    .append(", Current Domain Resource Amount = ").append(used)
                    .append(", Requested Resource Amount = ").append(numResources)
                    .append(".")
                    .toString();

            if (s_logger.isDebugEnabled()) {
                s_logger.debug("Checking if" + details);
            }

            final boolean limited = limit != Resource.RESOURCE_UNLIMITED;
            if (limited && wanted > limit) {
                final String failure = "Maximum" + details;
                final ResourceAllocationException exceeded = new ResourceAllocationException(failure, type);
                s_logger.error(failure, exceeded);
                throw exceeded;
            }
        }

        // move one level up the hierarchy
        domainId = domain.getParent();
    }
}
/**
 * Verifies the account-level limit for a pending allocation of {@code numResources} units of {@code type}.
 * The requested total is the account's current count plus {@code numResources}; it is rejected only
 * when the account limit is not {@code Resource.RESOURCE_UNLIMITED} and the total is above the limit.
 *
 * @param account account whose limit and current count are consulted
 * @param project optional project; used only to label the log/exception message
 * @param type resource type being allocated
 * @param numResources amount requested on top of the account's current count
 * @throws ResourceAllocationException if the account's finite limit would be exceeded
 */
private void checkAccountResourceLimit(final Account account, final Project project, final ResourceType type, long numResources) throws ResourceAllocationException {
    final long limit = findCorrectResourceLimitForAccount(account, type);
    final long used = _resourceCountDao.getResourceCount(account.getId(), ResourceOwnerType.Account, type);
    final long wanted = used + numResources;

    // the message names the project when one is given, otherwise the account
    final String ownerLabel;
    if (project != null) {
        ownerLabel = "Project Name = " + project.getName();
    } else {
        ownerLabel = "Account Name = " + account.getAccountName();
    }

    final String details = new StringBuilder(" amount of resources of Type = '")
            .append(type)
            .append("' for ").append(ownerLabel)
            .append(" in Domain Id = ").append(account.getDomainId())
            .append(" is exceeded: Account Resource Limit = ").append(limit)
            .append(", Current Account Resource Amount = ").append(used)
            .append(", Requested Resource Amount = ").append(numResources)
            .append(".")
            .toString();

    if (s_logger.isDebugEnabled()) {
        s_logger.debug("Checking if" + details);
    }

    final boolean limited = limit != Resource.RESOURCE_UNLIMITED;
    if (limited && wanted > limit) {
        final String failure = "Maximum" + details;
        final ResourceAllocationException exceeded = new ResourceAllocationException(failure, type);
        s_logger.error(failure, exceeded);
        throw exceeded;
    }
}
/**
 * Locks the resource-count rows that {@code listAllRowsToUpdate} reports for the given account and
 * resource type, and returns the locked rows.
 * NOTE(review): judging by the method name, the row set presumably covers the account plus its
 * owning domain chain - confirm against ResourceCountDao.
 *
 * @param accountId id of the account owning the counters
 * @param type resource type whose counters are locked
 * @return the rows returned by {@code lockRows}
 */
private List<ResourceCountVO> lockAccountAndOwnerDomainRows(long accountId, final ResourceType type) {
    final Set<Long> ids = _resourceCountDao.listAllRowsToUpdate(accountId, ResourceOwnerType.Account, type);

    final SearchCriteria<ResourceCountVO> criteria = ResourceCountSearch.create();
    criteria.setParameters("id", ids.toArray());

    final List<ResourceCountVO> lockedRows = _resourceCountDao.lockRows(criteria, null, true);
    return lockedRows;
}
/**
 * Locks the resource-count rows that {@code listAllRowsToUpdate} reports for the given domain and
 * resource type, and returns the locked rows.
 *
 * @param domainId id of the domain owning the counters
 * @param type resource type whose counters are locked
 * @return the rows returned by {@code lockRows}
 */
private List<ResourceCountVO> lockDomainRows(long domainId, final ResourceType type) {
    final Set<Long> ids = _resourceCountDao.listAllRowsToUpdate(domainId, ResourceOwnerType.Domain, type);

    final SearchCriteria<ResourceCountVO> criteria = ResourceCountSearch.create();
    criteria.setParameters("id", ids.toArray());

    final List<ResourceCountVO> lockedRows = _resourceCountDao.lockRows(criteria, null, true);
    return lockedRows;
}
public long findDefaultResourceLimitForDomain(ResourceType resourceType) {
Long resourceLimit = null;
resourceLimit = domainResourceLimitMap.get(resourceType);
@ -441,48 +516,11 @@ public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLim
@Override
public void doInTransactionWithoutResult(TransactionStatus status) throws ResourceAllocationException {
// Lock all rows first so nobody else can read it
Set<Long> rowIdsToLock = _resourceCountDao.listAllRowsToUpdate(account.getId(), ResourceOwnerType.Account, type);
SearchCriteria<ResourceCountVO> sc = ResourceCountSearch.create();
sc.setParameters("id", rowIdsToLock.toArray());
_resourceCountDao.lockRows(sc, null, true);
lockAccountAndOwnerDomainRows(account.getId(), type);
// Check account limits
long accountLimit = findCorrectResourceLimitForAccount(account, type);
long potentialCount = _resourceCountDao.getResourceCount(account.getId(), ResourceOwnerType.Account, type) + numResources;
if (accountLimit != Resource.RESOURCE_UNLIMITED && potentialCount > accountLimit) {
String message =
"Maximum number of resources of type '" + type + "' for account name=" + account.getAccountName() + " in domain id=" + account.getDomainId() +
" has been exceeded.";
if (projectFinal != null) {
message =
"Maximum number of resources of type '" + type + "' for project name=" + projectFinal.getName() + " in domain id=" + account.getDomainId() +
" has been exceeded.";
}
ResourceAllocationException e= new ResourceAllocationException(message, type);;
s_logger.error(message, e);
throw e;
}
checkAccountResourceLimit(account, projectFinal, type, numResources);
// check all domains in the account's domain hierarchy
Long domainId = null;
if (projectFinal != null) {
domainId = projectFinal.getDomainId();
} else {
domainId = account.getDomainId();
}
while (domainId != null) {
DomainVO domain = _domainDao.findById(domainId);
// no limit check if it is ROOT domain
if (domainId != Domain.ROOT_DOMAIN) {
long domainLimit = findCorrectResourceLimitForDomain(domain, type);
long domainCount = _resourceCountDao.getResourceCount(domainId, ResourceOwnerType.Domain, type) + numResources;
if (domainLimit != Resource.RESOURCE_UNLIMITED && domainCount > domainLimit) {
throw new ResourceAllocationException("Maximum number of resources of type '" + type + "' for domain id=" + domainId + " has been exceeded.", type);
}
}
domainId = domain.getParent();
}
checkDomainResourceLimit(account, projectFinal, type, numResources);
}
});
}
@ -650,7 +688,7 @@ public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLim
//Convert max storage size from GiB to bytes
if ((resourceType == ResourceType.primary_storage || resourceType == ResourceType.secondary_storage) && max >= 0) {
max = max * ResourceType.bytesToGiB;
max *= ResourceType.bytesToGiB;
}
ResourceOwnerType ownerType = null;
@ -780,25 +818,22 @@ public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLim
@DB
protected boolean updateResourceCountForAccount(final long accountId, final ResourceType type, final boolean increment, final long delta) {
if(s_logger.isDebugEnabled()) {
s_logger.debug("Updating resource Type = " + type + " count for Account = " + accountId +
" Operation = " + (increment ? "increasing" : "decreasing") + " Amount = " + delta);
}
try {
return Transaction.execute(new TransactionCallback<Boolean>() {
@Override
public Boolean doInTransaction(TransactionStatus status) {
boolean result = true;
Set<Long> rowsToLock = _resourceCountDao.listAllRowsToUpdate(accountId, ResourceOwnerType.Account, type);
// Lock rows first
SearchCriteria<ResourceCountVO> sc = ResourceCountSearch.create();
sc.setParameters("id", rowsToLock.toArray());
List<ResourceCountVO> rowsToUpdate = _resourceCountDao.lockRows(sc, null, true);
List<ResourceCountVO> rowsToUpdate = lockAccountAndOwnerDomainRows(accountId, type);
for (ResourceCountVO rowToUpdate : rowsToUpdate) {
if (!_resourceCountDao.updateById(rowToUpdate.getId(), increment, delta)) {
s_logger.trace("Unable to update resource count for the row " + rowToUpdate);
result = false;
}
}
return result;
}
});
@ -813,16 +848,10 @@ public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLim
return Transaction.execute(new TransactionCallback<Long>() {
@Override
public Long doInTransaction(TransactionStatus status) {
long newCount = 0;
// Lock all rows first so nobody else can read it
Set<Long> rowIdsToLock = _resourceCountDao.listAllRowsToUpdate(domainId, ResourceOwnerType.Domain, type);
SearchCriteria<ResourceCountVO> sc = ResourceCountSearch.create();
sc.setParameters("id", rowIdsToLock.toArray());
_resourceCountDao.lockRows(sc, null, true);
long newResourceCount = 0;
lockDomainRows(domainId, type);
ResourceCountVO domainRC = _resourceCountDao.findByOwnerAndType(domainId, ResourceOwnerType.Domain, type);
long oldCount = domainRC.getCount();
long oldResourceCount = domainRC.getCount();
List<DomainVO> domainChildren = _domainDao.findImmediateChildrenForParent(domainId);
// for each child domain update the resource count
@ -830,30 +859,31 @@ public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLim
// calculate project count here
if (type == ResourceType.project) {
newCount = newCount + _projectDao.countProjectsForDomain(domainId);
newResourceCount += _projectDao.countProjectsForDomain(domainId);
}
for (DomainVO domainChild : domainChildren) {
long domainCount = recalculateDomainResourceCount(domainChild.getId(), type);
newCount = newCount + domainCount; // add the child domain count to parent domain count
for (DomainVO childDomain : domainChildren) {
long childDomainResourceCount = recalculateDomainResourceCount(childDomain.getId(), type);
newResourceCount += childDomainResourceCount; // add the child domain count to parent domain count
}
}
if (type.supportsOwner(ResourceOwnerType.Account)) {
List<AccountVO> accounts = _accountDao.findActiveAccountsForDomain(domainId);
for (AccountVO account : accounts) {
long accountCount = recalculateAccountResourceCount(account.getId(), type);
newCount = newCount + accountCount; // add account's resource count to parent domain count
long accountResourceCount = recalculateAccountResourceCount(account.getId(), type);
newResourceCount += accountResourceCount; // add account's resource count to parent domain count
}
}
_resourceCountDao.setResourceCount(domainId, ResourceOwnerType.Domain, type, newCount);
_resourceCountDao.setResourceCount(domainId, ResourceOwnerType.Domain, type, newResourceCount);
if (oldCount != newCount) {
s_logger.info("Discrepency in the resource count " + "(original count=" + oldCount + " correct count = " + newCount + ") for type " + type +
" for domain ID " + domainId + " is fixed during resource count recalculation.");
if (oldResourceCount != newResourceCount) {
s_logger.warn("Discrepency in the resource count has been detected " + "(original count = " + oldResourceCount +
" correct count = " + newResourceCount + ") for Type = " + type +
" for Domain ID = " + domainId + " is fixed during resource count recalculation.");
}
return newCount;
return newResourceCount;
}
});
}
@ -864,19 +894,11 @@ public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLim
@Override
public Long doInTransaction(TransactionStatus status) {
Long newCount = null;
// this lock guards against the updates to user_vm, volume, snapshot, public _ip and template table
// as any resource creation precedes with the resourceLimitExceeded check which needs this lock too
Set rowIdsToLock = _resourceCountDao.listAllRowsToUpdate(accountId, Resource.ResourceOwnerType.Account, type);
SearchCriteria<ResourceCountVO> sc = ResourceCountSearch.create();
sc.setParameters("id", rowIdsToLock.toArray());
_resourceCountDao.lockRows(sc, null, true);
lockAccountAndOwnerDomainRows(accountId, type);
ResourceCountVO accountRC = _resourceCountDao.findByOwnerAndType(accountId, ResourceOwnerType.Account, type);
long oldCount = 0;
if (accountRC != null) {
if (accountRC != null)
oldCount = accountRC.getCount();
}
if (type == Resource.ResourceType.user_vm) {
newCount = _userVmDao.countAllocatedVMsForAccount(accountId);
@ -910,12 +932,13 @@ public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLim
}
_resourceCountDao.setResourceCount(accountId, ResourceOwnerType.Account, type, (newCount == null) ? 0 : newCount.longValue());
// No need to log message for primary and secondary storage because both are recalculating the resource count which will not lead to any discrepancy.
if (!Long.valueOf(oldCount).equals(newCount) && (type != Resource.ResourceType.primary_storage && type != Resource.ResourceType.secondary_storage)) {
s_logger.info("Discrepency in the resource count " + "(original count=" + oldCount + " correct count = " + newCount + ") for type " + type +
// No need to log message for primary and secondary storage because both are recalculating the
// resource count which will not lead to any discrepancy.
if (!Long.valueOf(oldCount).equals(newCount) &&
(type != Resource.ResourceType.primary_storage && type != Resource.ResourceType.secondary_storage)) {
s_logger.warn("Discrepency in the resource count " + "(original count=" + oldCount + " correct count = " + newCount + ") for type " + type +
" for account ID " + accountId + " is fixed during resource count recalculation.");
}
return newCount;
}
});
@ -1082,7 +1105,7 @@ public class ResourceLimitManagerImpl extends ManagerBase implements ResourceLim
@Override
protected void runInContext() {
s_logger.info("Running resource count check periodic task");
s_logger.info("Started resource counters recalculation periodic task.");
List<DomainVO> domains = _domainDao.findImmediateChildrenForParent(Domain.ROOT_DOMAIN);
// recalculateDomainResourceCount will take care of re-calculation of resource counts for sub-domains

View File

@ -816,7 +816,7 @@ public class VolumeApiServiceImpl extends ManagerBase implements VolumeApiServic
if (!created) {
s_logger.trace("Decrementing volume resource count for account id=" + volume.getAccountId() + " as volume failed to create on the backend");
_resourceLimitMgr.decrementResourceCount(volume.getAccountId(), ResourceType.volume, cmd.getDisplayVolume());
_resourceLimitMgr.recalculateResourceCount(volume.getAccountId(), volume.getDomainId(), ResourceType.primary_storage.getOrdinal());
_resourceLimitMgr.decrementResourceCount(volume.getAccountId(), ResourceType.primary_storage, cmd.getDisplayVolume(), new Long(volume.getSize()));
}
}
}
@ -1262,6 +1262,7 @@ public class VolumeApiServiceImpl extends ManagerBase implements VolumeApiServic
if (instanceId == null || (vmInstance.getType().equals(VirtualMachine.Type.User))) {
// Decrement the resource count for volumes and primary storage belonging user VM's only
_resourceLimitMgr.decrementResourceCount(volume.getAccountId(), ResourceType.volume, volume.isDisplayVolume());
_resourceLimitMgr.decrementResourceCount(volume.getAccountId(), ResourceType.primary_storage, volume.isDisplayVolume(), new Long(volume.getSize()));
}
}
// Mark volume as removed if volume has not been created on primary or secondary
@ -1277,7 +1278,8 @@ public class VolumeApiServiceImpl extends ManagerBase implements VolumeApiServic
AsyncCallFuture<VolumeApiResult> future = volService.expungeVolumeAsync(volOnPrimary);
future.get();
//decrement primary storage count
_resourceLimitMgr.recalculateResourceCount(volume.getAccountId(), volume.getDomainId(), ResourceType.primary_storage.getOrdinal());
_resourceLimitMgr.decrementResourceCount(volOnPrimary.getAccountId(), ResourceType.volume, volOnPrimary.isDisplayVolume());
_resourceLimitMgr.decrementResourceCount(volOnPrimary.getAccountId(), ResourceType.primary_storage, volOnPrimary.isDisplayVolume(), new Long(volOnPrimary.getSize()));
}
// expunge volume from secondary if volume is on image store
VolumeInfo volOnSecondary = volFactory.getVolume(volume.getId(), DataStoreRole.Image);
@ -1286,7 +1288,7 @@ public class VolumeApiServiceImpl extends ManagerBase implements VolumeApiServic
AsyncCallFuture<VolumeApiResult> future2 = volService.expungeVolumeAsync(volOnSecondary);
future2.get();
//decrement secondary storage count
_resourceLimitMgr.recalculateResourceCount(volume.getAccountId(), volume.getDomainId(), ResourceType.secondary_storage.getOrdinal());
_resourceLimitMgr.decrementResourceCount(volOnSecondary.getAccountId(), ResourceType.secondary_storage, new Long(volOnSecondary.getSize()));
}
// delete all cache entries for this volume
List<VolumeInfo> cacheVols = volFactory.listVolumeOnCache(volume.getId());

View File

@ -306,8 +306,7 @@ public class SnapshotManagerImpl extends MutualExclusiveIdsManagerBase implement
boolean result = snapshotStrategy.revertSnapshot(snapshotInfo);
if (result) {
// update volume size and primary storage count
_resourceLimitMgr.decrementResourceCount(snapshot.getAccountId(), ResourceType.primary_storage,
new Long(volume.getSize() - snapshot.getSize()));
_resourceLimitMgr.decrementResourceCount(snapshot.getAccountId(), ResourceType.primary_storage, new Long(volume.getSize() - snapshot.getSize()));
volume.setSize(snapshot.getSize());
_volsDao.update(volume.getId(), volume);
return snapshotInfo;

View File

@ -2139,7 +2139,7 @@ public class UserVmManagerImpl extends ManagerBase implements UserVmManager, Vir
// Update Resource count
if (vm.getAccountId() != Account.ACCOUNT_ID_SYSTEM && !rootVol.isEmpty()) {
_resourceLimitMgr.decrementResourceCount(vm.getAccountId(), ResourceType.volume);
_resourceLimitMgr.recalculateResourceCount(vm.getAccountId(), vm.getDomainId(), ResourceType.primary_storage.getOrdinal());
_resourceLimitMgr.decrementResourceCount(vm.getAccountId(), ResourceType.primary_storage, new Long(rootVol.get(0).getSize()));
}
// Only if vm is not expunged already, cleanup it's resources
@ -5568,9 +5568,9 @@ public class UserVmManagerImpl extends ManagerBase implements UserVmManager, Vir
@Override
public void doInTransactionWithoutResult(TransactionStatus status) {
//generate destroy vm event for usage
UsageEventUtils.publishUsageEvent(EventTypes.EVENT_VM_DESTROY, vm.getAccountId(), vm.getDataCenterId(), vm.getId(), vm.getHostName(), vm.getServiceOfferingId(),
vm.getTemplateId(), vm.getHypervisorType().toString(), VirtualMachine.class.getName(), vm.getUuid(), vm.isDisplayVm());
UsageEventUtils.publishUsageEvent(EventTypes.EVENT_VM_DESTROY, vm.getAccountId(), vm.getDataCenterId(),
vm.getId(), vm.getHostName(), vm.getServiceOfferingId(), vm.getTemplateId(),
vm.getHypervisorType().toString(), VirtualMachine.class.getName(), vm.getUuid(), vm.isDisplayVm());
// update resource counts for old account
resourceCountDecrement(oldAccount.getAccountId(), vm.isDisplayVm(), new Long(offering.getCpu()), new Long(offering.getRamSize()));
@ -5591,15 +5591,17 @@ public class UserVmManagerImpl extends ManagerBase implements UserVmManager, Vir
_resourceLimitMgr.incrementResourceCount(newAccount.getAccountId(), ResourceType.volume);
_resourceLimitMgr.incrementResourceCount(newAccount.getAccountId(), ResourceType.primary_storage, new Long(volume.getSize()));
UsageEventUtils.publishUsageEvent(EventTypes.EVENT_VOLUME_CREATE, volume.getAccountId(), volume.getDataCenterId(), volume.getId(), volume.getName(),
volume.getDiskOfferingId(), volume.getTemplateId(), volume.getSize(), Volume.class.getName(), volume.getUuid(), volume.isDisplayVolume());
volume.getDiskOfferingId(), volume.getTemplateId(), volume.getSize(), Volume.class.getName(),
volume.getUuid(), volume.isDisplayVolume());
}
//update resource count of new account
resourceCountIncrement(newAccount.getAccountId(), vm.isDisplayVm(), new Long(offering.getCpu()), new Long(offering.getRamSize()));
//generate usage events to account for this change
UsageEventUtils.publishUsageEvent(EventTypes.EVENT_VM_CREATE, vm.getAccountId(), vm.getDataCenterId(), vm.getId(), vm.getHostName(), vm.getServiceOfferingId(),
vm.getTemplateId(), vm.getHypervisorType().toString(), VirtualMachine.class.getName(), vm.getUuid(), vm.isDisplayVm());
UsageEventUtils.publishUsageEvent(EventTypes.EVENT_VM_CREATE, vm.getAccountId(), vm.getDataCenterId(), vm.getId(),
vm.getHostName(), vm.getServiceOfferingId(), vm.getTemplateId(), vm.getHypervisorType().toString(),
VirtualMachine.class.getName(), vm.getUuid(), vm.isDisplayVm());
}
});