diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java index f901fd97ca76..c069416cca54 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java @@ -55,7 +55,7 @@ public Answer execute(final CheckOnHostCommand command, final LibvirtComputingRe if (hasHeartBeat) { return new CheckOnHostAnswer(command, true, "Heart is beating"); } else { - return new CheckOnHostAnswer(command, "Heart is not beating"); + return new CheckOnHostAnswer(command, false, "Heart is not beating"); } } catch (final InterruptedException e) { return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't get status of host: InterruptedException"); diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java index f0b5cfc337de..352c1b14724e 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java @@ -33,6 +33,7 @@ import org.apache.cloudstack.ha.provider.HARecoveryException; import org.apache.cloudstack.ha.provider.host.HAAbstractHostProvider; import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation; +import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState; import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService; import org.joda.time.DateTime; @@ -85,18 +86,57 @@ public boolean recover(Host r) throws HARecoveryException { @Override public boolean fence(Host r) throws HAFenceException { + if (!outOfBandManagementService.isOutOfBandManagementEnabled(r)) { + logger.warn("Out-of-band management is not enabled/configured for host {}; cannot fence it.", r); + return false; + } + // Fencing must guarantee the host is powered off, and the only reliable signal for that is + // the actual chassis power state - not the return code of the power-off command. An already-off + // (or just-turned-off) host can make the power-off command report an error/conflict even though + // the host is down; conversely, an unreachable BMC must NOT be treated as a successful fence, to + // avoid split-brain. Therefore: (1) if already off, consider it fenced; (2) otherwise issue a + // best-effort power-off; (3) declare the fence successful only if the power state is confirmed Off. try { - if (outOfBandManagementService.isOutOfBandManagementEnabled(r)){ - final OutOfBandManagementResponse resp = outOfBandManagementService.executePowerOperation(r, PowerOperation.OFF, null); - return resp.getSuccess(); - } else { - logger.warn("OOBM fence operation failed for this host {}", r); - return false; + if (isHostPoweredOff(r)) { + logger.info("Host {} is already powered off; considering it fenced.", r); + return true; } - } catch (Exception e){ - logger.warn("OOBM service is not configured or enabled for this host {} error is {}", r, e.getMessage()); - throw new HAFenceException(String.format("OBM service is not configured or enabled for this host %s", r.getName()), e); + + try { + outOfBandManagementService.executePowerOperation(r, PowerOperation.OFF, null); + } catch (Exception e) { + // The power-off may legitimately fail (e.g. the chassis is already off or the command + // conflicts with the current power state). Do not fail here - verify the actual power + // state below instead of trusting the command result. + logger.warn("Out-of-band power-off command for host {} did not complete successfully ({}); verifying the actual power state.", r, e.getMessage()); + } + + if (isHostPoweredOff(r)) { + logger.info("Confirmed host {} is powered off; fencing successful.", r); + return true; + } + + logger.warn("Could not confirm host {} is powered off after the fence power-off; fencing will be retried.", r); + return false; + } catch (Exception e) { + logger.warn("Out-of-band fence operation failed for host {}: {}", r, e.getMessage()); + throw new HAFenceException(String.format("Out-of-band fence operation failed for host %s", r.getName()), e); + } + } + + /** + * Returns {@code true} only when the host's chassis power is positively confirmed to be Off via + * out-of-band management. Any failure to determine the power state (e.g. an unreachable BMC) returns + * {@code false}, so that a host whose state cannot be confirmed is never treated as fenced. + */ + protected boolean isHostPoweredOff(Host r) { + try { + final OutOfBandManagementResponse statusResponse = outOfBandManagementService.executePowerOperation(r, PowerOperation.STATUS, null); + return statusResponse != null && PowerState.Off.equals(statusResponse.getPowerState()); + } catch (Exception e) { + logger.warn("Unable to determine the out-of-band power state of host {}: {}", r, e.getMessage()); + return false; } } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index af7441c4fd29..96fd42d4d544 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -109,6 +109,7 @@ public Status getHostAgentStatus(Host host) { } Status hostStatusFromNeighbour = checkHostStatusWithNeighbourHosts(host); + logger.debug("{} status reported from itself: {} and neighbor: {}", host.toString(), hostStatusFromItself, hostStatusFromNeighbour); Status hostStatus = hostStatusFromItself; if (hostStatusFromNeighbour == Status.Up && (hostStatusFromItself == Status.Disconnected || hostStatusFromItself == Status.Down)) { hostStatus = Status.Disconnected; diff --git a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java index 662b09a30448..aaebd8c7aebc 100644 --- a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java +++ b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java @@ -61,6 +61,7 @@ import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; +import com.cloud.agent.api.CheckOnHostAnswer; import com.google.gson.JsonObject; import com.google.gson.JsonParser; @@ -3130,7 +3131,9 @@ public void testCheckOnHostCommand() { assertNotNull(wrapper); final Answer answer = wrapper.execute(command, libvirtComputingResourceMock); - assertFalse(answer.getResult()); + assertTrue(answer.getResult()); + assertTrue(answer instanceof CheckOnHostAnswer); + assertFalse(((CheckOnHostAnswer)answer).isAlive()); verify(libvirtComputingResourceMock, times(1)).getMonitor(); } diff --git a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java index a94fb01475a3..8b7cfc854c08 100644 --- a/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java +++ b/plugins/hypervisors/kvm/src/test/java/org/apache/cloudstack/kvm/ha/KVMHostHATest.java @@ -20,10 +20,20 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.isNull; import static org.mockito.Mockito.lenient; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import org.apache.cloudstack.api.response.OutOfBandManagementResponse; import org.apache.cloudstack.ha.provider.HACheckerException; +import org.apache.cloudstack.ha.provider.HAFenceException; +import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerOperation; +import org.apache.cloudstack.outofbandmanagement.OutOfBandManagement.PowerState; +import org.apache.cloudstack.outofbandmanagement.OutOfBandManagementService; import org.joda.time.DateTime; import org.junit.Before; import org.junit.Test; @@ -34,6 +44,7 @@ import com.cloud.exception.StorageUnavailableException; import com.cloud.host.Host; import com.cloud.hypervisor.Hypervisor.HypervisorType; +import com.cloud.utils.exception.CloudRuntimeException; @RunWith(MockitoJUnitRunner.class) public class KVMHostHATest { @@ -42,12 +53,21 @@ public class KVMHostHATest { private Host host; @Mock private KVMHostActivityChecker kvmHostActivityChecker; + @Mock + private OutOfBandManagementService outOfBandManagementService; private KVMHAProvider kvmHAProvider; @Before public void setup() { kvmHAProvider = new KVMHAProvider(); kvmHAProvider.hostActivityChecker = kvmHostActivityChecker; + kvmHAProvider.outOfBandManagementService = outOfBandManagementService; + } + + private OutOfBandManagementResponse powerStatusResponse(PowerState state) { + OutOfBandManagementResponse response = mock(OutOfBandManagementResponse.class); + lenient().when(response.getPowerState()).thenReturn(state); + return response; } @Test @@ -80,4 +100,60 @@ public void testHostActivityForDownHost() throws HACheckerException, StorageUnav assertFalse(kvmHAProvider.hasActivity(host, dt)); } + @Test + public void testFenceWhenOutOfBandManagementNotEnabled() throws HAFenceException { + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(false); + assertFalse(kvmHAProvider.fence(host)); + verify(outOfBandManagementService, never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()); + } + + @Test + public void testFenceWhenHostAlreadyPoweredOff() throws HAFenceException { + OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off); + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())).thenReturn(offStatus); + assertTrue(kvmHAProvider.fence(host)); + // The host is already off, so no power-off command should be issued. + verify(outOfBandManagementService, never()).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()); + } + + @Test + public void testFenceConfirmedOffAfterPowerOff() throws HAFenceException { + OutOfBandManagementResponse onStatus = powerStatusResponse(PowerState.On); + OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off); + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true); + // First STATUS (pre-check) returns On, second STATUS (post power-off) returns Off. + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())) + .thenReturn(onStatus, offStatus); + assertTrue(kvmHAProvider.fence(host)); + verify(outOfBandManagementService).executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull()); + } + + @Test + public void testFencePowerOffCommandFailsButHostConfirmedOff() throws HAFenceException { + // Regression: the power-off command fails (e.g. the chassis is already off and the BMC returns + // HTTP 409), but the host is actually off. Fencing must succeed because the confirmed power state - + // not the power-off command's result - is authoritative. + OutOfBandManagementResponse onStatus = powerStatusResponse(PowerState.On); + OutOfBandManagementResponse offStatus = powerStatusResponse(PowerState.Off); + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())) + .thenReturn(onStatus, offStatus); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull())) + .thenThrow(new CloudRuntimeException("power-off failed: HTTP 409")); + assertTrue(kvmHAProvider.fence(host)); + } + + @Test + public void testFenceFailsWhenPowerStateCannotBeConfirmedOff() throws HAFenceException { + // The host cannot be confirmed off (e.g. an unreachable BMC). Fencing must NOT succeed, to avoid + // restarting VMs while the host may still be running (split-brain). + when(outOfBandManagementService.isOutOfBandManagementEnabled(host)).thenReturn(true); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.STATUS), isNull())) + .thenThrow(new CloudRuntimeException("BMC unreachable")); + when(outOfBandManagementService.executePowerOperation(eq(host), eq(PowerOperation.OFF), isNull())) + .thenThrow(new CloudRuntimeException("BMC unreachable")); + assertFalse(kvmHAProvider.fence(host)); + } + } diff --git a/plugins/hypervisors/vmware/src/main/java/com/cloud/hypervisor/vmware/resource/VmwareResource.java b/plugins/hypervisors/vmware/src/main/java/com/cloud/hypervisor/vmware/resource/VmwareResource.java index 831e222200ab..7b9f258920c5 100644 --- a/plugins/hypervisors/vmware/src/main/java/com/cloud/hypervisor/vmware/resource/VmwareResource.java +++ b/plugins/hypervisors/vmware/src/main/java/com/cloud/hypervisor/vmware/resource/VmwareResource.java @@ -5807,7 +5807,7 @@ protected Answer execute(PingTestCommand cmd) { } protected Answer execute(CheckOnHostCommand cmd) { - return new CheckOnHostAnswer(cmd, null, "Not Implmeneted"); + return new CheckOnHostAnswer(cmd, null, "Not Implemented"); } protected Answer execute(ModifySshKeysCommand cmd) { diff --git a/plugins/hypervisors/xenserver/src/main/java/com/cloud/hypervisor/xenserver/resource/wrapper/xenbase/CitrixCheckOnHostCommandWrapper.java b/plugins/hypervisors/xenserver/src/main/java/com/cloud/hypervisor/xenserver/resource/wrapper/xenbase/CitrixCheckOnHostCommandWrapper.java index 5535b7e84c87..9a675c33e74a 100644 --- a/plugins/hypervisors/xenserver/src/main/java/com/cloud/hypervisor/xenserver/resource/wrapper/xenbase/CitrixCheckOnHostCommandWrapper.java +++ b/plugins/hypervisors/xenserver/src/main/java/com/cloud/hypervisor/xenserver/resource/wrapper/xenbase/CitrixCheckOnHostCommandWrapper.java @@ -31,6 +31,6 @@ public final class CitrixCheckOnHostCommandWrapper extends CommandWrapper