Abort cleanly when deployment fails
- Introduce (first version of) state machine in NebulousApp. - Make various app state maps read-only so we never have a half-deployed state; instead, we update the app object atomically at the end. - Add deleteCluster endpoint. - Delete cluster when any deployment step fails. Change-Id: Ib05934035a808373e001937d614f3191aa926f1b
This commit is contained in:
parent
4b6d479ae5
commit
ab6375304d
@ -94,6 +94,8 @@ public class ExnConnector {
|
||||
public final SyncedPublisher scaleOut;
|
||||
/** The scaleIn endpoint. */
|
||||
public final SyncedPublisher scaleIn;
|
||||
/** The deleteCluster endpoint. */
|
||||
public final SyncedPublisher deleteCluster;
|
||||
|
||||
/**
|
||||
* Create a connection to ActiveMQ via the exn middleware, and set up the
|
||||
@ -119,6 +121,7 @@ public class ExnConnector {
|
||||
deployApplication = new SyncedPublisher("deployApplication", "eu.nebulouscloud.exn.sal.cluster.deployapplication", true, true);
|
||||
scaleOut = new SyncedPublisher("scaleOut", "eu.nebulouscloud.exn.sal.cluster.scaleout", true, true);
|
||||
scaleIn = new SyncedPublisher("scaleIn", "eu.nebulouscloud.exn.sal.cluster.scalein", true, true);
|
||||
// Give this publisher its own key, matching the field name like the
// publishers above (it previously reused "deployCluster").
deleteCluster = new SyncedPublisher("deleteCluster", "eu.nebulouscloud.exn.sal.cluster.delete", true, true);
|
||||
|
||||
conn = new Connector("optimiser_controller",
|
||||
callback,
|
||||
@ -131,7 +134,8 @@ public class ExnConnector {
|
||||
deployCluster,
|
||||
deployApplication,
|
||||
scaleOut,
|
||||
scaleIn),
|
||||
scaleIn,
|
||||
deleteCluster),
|
||||
List.of(
|
||||
new Consumer("ui_app_messages", app_creation_channel,
|
||||
new AppCreationMessageHandler(), true, true),
|
||||
@ -481,7 +485,10 @@ public class ExnConnector {
|
||||
}
|
||||
|
||||
/**
|
||||
* Deploy a cluster created by {@link #defineCluster}.
|
||||
* Deploy a cluster created by {@link #defineCluster}. Note that the call
|
||||
* will return before the cluster is ready, i.e., {@link #getCluster} must
|
||||
* be checked before trying to call {@link #labelNodes} or {@link
|
||||
* #deployApplication}.
|
||||
*
|
||||
* @param appID The application's id, used for logging only.
|
||||
* @param clusterName The name of the cluster.
|
||||
@ -566,7 +573,6 @@ public class ExnConnector {
|
||||
* @return true if the call was successful, false otherwise.
|
||||
*/
|
||||
public boolean scaleIn(String appID, List<String> superfluousNodes) {
|
||||
// NOTE: not yet defined in
|
||||
// https://openproject.nebulouscloud.eu/projects/nebulous-collaboration-hub/wiki/deployment-manager-sal-1#specification-of-endpoints-being-developed
|
||||
ArrayNode body = mapper.createArrayNode();
|
||||
superfluousNodes.forEach(nodeName -> body.add(nodeName));
|
||||
@ -584,5 +590,20 @@ public class ExnConnector {
|
||||
return payload.asBoolean();
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a cluster created by {@link #defineCluster}.
|
||||
*
|
||||
* @param appID The application's id, used for logging only.
|
||||
* @param clusterName The name of the cluster.
|
||||
* @return true if the cluster was successfully deleted, false otherwise.
|
||||
*/
|
||||
public boolean deleteCluster(String appID, String clusterName) {
|
||||
// https://openproject.nebulouscloud.eu/projects/nebulous-collaboration-hub/wiki/deployment-manager-sal-1#specification-of-endpoints-being-developed
|
||||
Map<String, Object> msg = Map.of("metaData",
|
||||
Map.of("user", "admin", "clusterName", clusterName));
|
||||
Map<String, Object> response = deleteCluster.sendSync(msg, appID, null, false);
|
||||
JsonNode payload = extractPayloadFromExnResponse(response, appID, "deleteCluster");
|
||||
return payload.asBoolean();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
|
||||
import eu.nebulouscloud.exn.core.Publisher;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.Synchronized;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import static net.logstash.logback.argument.StructuredArguments.keyValue;
|
||||
|
||||
@ -60,6 +61,36 @@ public class NebulousApp {
|
||||
*/
|
||||
@Getter private String clusterName;
|
||||
|
||||
/**
 * The application status.
 */
public enum State {
    /**
     * The application has been created from the GUI and is waiting for
     * the performance indicators.
     */
    NEW,
    /** The application is ready for deployment. */
    READY,
    /** The application is being deployed or redeployed. */
    DEPLOYING,
    /**
     * The application is deployed, we're waiting for the solver to be
     * ready so we can send AMPL and performance indicators.
     */
    SOLVER_WAITING,
    /** The application is running, and under redeployment. */
    RUNNING,
    /**
     * The application is in an invalid state: one or more messages could
     * not be parsed, or deployment or redeployment failed.
     */
    FAILED
}
|
||||
|
||||
@Getter
|
||||
private State state;
|
||||
|
||||
// ----------------------------------------
|
||||
// App message parsing stuff
|
||||
|
||||
@ -114,37 +145,38 @@ public class NebulousApp {
|
||||
* to 1, each subsequent redeployment increases by 1. This value is used
|
||||
* to name node instances generated during that deployment.
|
||||
*/
|
||||
@Getter @Setter
|
||||
@Getter
|
||||
private int deployGeneration = 0;
|
||||
|
||||
/**
|
||||
* Map of component name to node name(s) deployed for that component.
|
||||
* Component names are defined in the KubeVela file. We assume that
|
||||
* component names stay constant during redeployment, i.e., once an
|
||||
* application is deployed, its KubeVela file will not change.
|
||||
* Unmodifiable map of component name to node name(s) deployed for that
|
||||
* component. Component names are defined in the KubeVela file. We
|
||||
* assume that component names stay constant during redeployment, i.e.,
|
||||
* once an application is deployed, its KubeVela file will not change.
|
||||
*
|
||||
* Note that this map does not include the master node, since this is not
|
||||
* specified in KubeVela.
|
||||
*/
|
||||
@Getter
|
||||
private Map<String, Set<String>> componentNodeNames = new HashMap<>();
|
||||
private Map<String, Set<String>> componentNodeNames = Map.of();
|
||||
/**
|
||||
* Map from node name to deployed edge or BYON node candidate. We keep
|
||||
* track of assigned edge candidates, since we do not want to
|
||||
* doubly-assign edge nodes. We also store the node name, so we can
|
||||
* "free" the edge candidate when the current component gets redeployed
|
||||
* and lets go of its edge node. (We do not track cloud node candidates
|
||||
* since these can be instantiated multiple times.)
|
||||
* Unmodifiable map from node name to deployed edge or BYON node
|
||||
* candidate. We keep track of assigned edge candidates, since we do not
|
||||
* want to doubly-assign edge nodes. We also store the node name, so we
|
||||
* can "free" the edge candidate when the current component gets
|
||||
* redeployed and lets go of its edge node. (We do not track cloud node
|
||||
* candidates since these can be instantiated multiple times.)
|
||||
*/
|
||||
@Getter
|
||||
private Map<String, NodeCandidate> nodeEdgeCandidates = new HashMap<>();
|
||||
/** Map of component name to its requirements, as currently deployed.
|
||||
* Each replica of a component has identical requirements. */
|
||||
@Getter @Setter
|
||||
private Map<String, List<Requirement>> componentRequirements = new HashMap<>();
|
||||
/** Map of component name to its replica count, as currently deployed. */
|
||||
@Getter @Setter
|
||||
private Map<String, Integer> componentReplicaCounts = new HashMap<>();
|
||||
private Map<String, NodeCandidate> nodeEdgeCandidates = Map.of();
|
||||
/** Unmodifiable map of component name to its requirements, as currently
|
||||
* deployed. Each replica of a component has identical requirements. */
|
||||
@Getter
|
||||
private Map<String, List<Requirement>> componentRequirements = Map.of();
|
||||
/** Unmodifiable map of component name to its replica count, as currently
|
||||
* deployed. */
|
||||
@Getter
|
||||
private Map<String, Integer> componentReplicaCounts = Map.of();
|
||||
|
||||
/** When an app gets deployed, this is where we send the AMPL file */
|
||||
private Publisher ampl_message_channel;
|
||||
@ -153,15 +185,15 @@ public class NebulousApp {
|
||||
// private boolean deployed = false;
|
||||
|
||||
/** The KubeVela as it was most recently sent to the app's controller. */
|
||||
@Getter @Setter
|
||||
@Getter
|
||||
private JsonNode deployedKubevela;
|
||||
/** For each KubeVela component, the number of deployed nodes. All nodes
|
||||
* will be identical wrt machine type etc. */
|
||||
@Getter @Setter
|
||||
private Map<String, Integer> deployedNodeCounts;
|
||||
/** For each KubeVela component, the requirements for its node(s). */
|
||||
@Getter @Setter
|
||||
private Map<String, List<Requirement>> deployedNodeRequirements;
|
||||
* will be identical wrt machine type etc. Unmodifiable map. */
|
||||
@Getter
|
||||
private Map<String, Integer> deployedNodeCounts = Map.of();
|
||||
/** For each KubeVela component, the requirements for its node(s). Unmodifiable map. */
|
||||
@Getter
|
||||
private Map<String, List<Requirement>> deployedNodeRequirements = Map.of();
|
||||
|
||||
/**
|
||||
* The EXN connector for this class. At the moment all apps share the
|
||||
@ -183,6 +215,7 @@ public class NebulousApp {
|
||||
public NebulousApp(JsonNode app_message, ObjectNode kubevela, ExnConnector exnConnector) {
|
||||
this.UUID = app_message.at(uuid_path).textValue();
|
||||
this.name = app_message.at(name_path).textValue();
|
||||
this.state = State.READY;
|
||||
this.clusterName = NebulousApps.calculateUniqueClusterName(this.UUID);
|
||||
this.originalAppMessage = app_message;
|
||||
this.originalKubevela = kubevela;
|
||||
@ -286,6 +319,46 @@ public class NebulousApp {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the state from READY to DEPLOYING, and increment the generation.
|
||||
*
|
||||
* @return false if deployment could not be started, true otherwise.
|
||||
*/
|
||||
@Synchronized
|
||||
public boolean setStateDeploying() {
|
||||
if (state != State.READY) {
|
||||
return false;
|
||||
} else {
|
||||
state = State.DEPLOYING;
|
||||
deployGeneration++;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
/** Set state from DEPLOYING to RUNNING and update app cluster information.
|
||||
* @return false if not in state deploying, otherwise true. */
|
||||
@Synchronized
|
||||
public boolean setStateDeploymentFinished(Map<String, List<Requirement>> componentRequirements, Map<String, Integer> nodeCounts, Map<String, Set<String>> componentNodeNames, Map<String, NodeCandidate> nodeEdgeCandidates, JsonNode deployedKubevela) {
|
||||
if (state != State.DEPLOYING) {
|
||||
return false;
|
||||
} else {
|
||||
// We keep all state read-only so we cannot modify the app object
|
||||
// before we know deployment is successful
|
||||
this.componentRequirements = Map.copyOf(componentRequirements);
|
||||
this.componentReplicaCounts = Map.copyOf(nodeCounts);
|
||||
this.componentNodeNames = Map.copyOf(componentNodeNames);
|
||||
this.deployedKubevela = deployedKubevela;
|
||||
this.nodeEdgeCandidates = Map.copyOf(nodeEdgeCandidates);
|
||||
state = State.RUNNING;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/** Set state unconditionally to FAILED. No more state changes will be
|
||||
* possible once the state is set to FAILED. */
|
||||
public void setStateFailed() {
|
||||
state = State.FAILED;
|
||||
}
|
||||
|
||||
/** Utility function to parse a KubeVela string. Can be used from jshell. */
|
||||
public static JsonNode readKubevelaString(String kubevela) throws JsonMappingException, JsonProcessingException {
|
||||
return yamlMapper.readTree(kubevela);
|
||||
@ -391,6 +464,11 @@ public class NebulousApp {
|
||||
|
||||
/**
|
||||
* Calculate AMPL file and send it off to the solver.
|
||||
*
|
||||
* <p> TODO: this should be done once from a message handler that listens
|
||||
* for an incoming "solver ready" message
|
||||
*
|
||||
* <p> TODO: also send performance indicators to solver here
|
||||
*/
|
||||
public void sendAMPL() {
|
||||
String ampl = AMPLGenerator.generateAMPL(this);
|
||||
|
@ -21,8 +21,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.node.ArrayNode;
|
||||
import com.fasterxml.jackson.databind.node.ObjectNode;
|
||||
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
|
||||
import com.fasterxml.jackson.dataformat.yaml.YAMLGenerator;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import static net.logstash.logback.argument.StructuredArguments.keyValue;
|
||||
|
||||
@ -145,6 +143,36 @@ public class NebulousAppDeployer {
|
||||
.allMatch(node -> node.get("state").asText().equals("Finished"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait until all nodes in cluster are in state "Finished".
|
||||
*
|
||||
* <p>Note: Cluster deployment includes provisioning and booting VMs,
|
||||
* installing various software packages, bringing up a Kubernetes cluster
|
||||
* and installing the NebulOuS runtime. This can take some minutes.
|
||||
*/
|
||||
private static boolean waitForClusterDeploymentFinished(ExnConnector conn, String clusterName, String appUUID) {
|
||||
// TODO: find out what state node(s) or the whole cluster are in when
|
||||
// cluster start fails, and return false in that case.
|
||||
JsonNode clusterState = conn.getCluster(clusterName);
|
||||
while (clusterState == null || !isClusterDeploymentFinished(clusterState)) {
|
||||
log.info("Waiting for cluster deployment to finish...",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName),
|
||||
keyValue("clusterState", clusterState));
|
||||
try {
|
||||
Thread.sleep(10000);
|
||||
} catch (InterruptedException e1) {
|
||||
// ignore
|
||||
}
|
||||
// TODO: distinguish between clusterState==null because SAL hasn't
|
||||
// set up its datastructures yet, and clusterState==null because
|
||||
// the call to getCluster failed. In the latter case we want to
|
||||
// abort (because someone has deleted the cluster), in the former
|
||||
// case we want to continue.
|
||||
clusterState = conn.getCluster(clusterName);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a KubeVela file, extract node requirements, create the job, start
|
||||
* its nodes and submit KubeVela.
|
||||
@ -152,45 +180,52 @@ public class NebulousAppDeployer {
|
||||
* <p>NOTE: this method modifies the NebulousApp object state, storing
|
||||
* various facts about the deployed cluster.
|
||||
*
|
||||
* <p>NOTE: this method is under reconstruction, pending the new
|
||||
* endpoints.
|
||||
*
|
||||
* @param app The NebulOuS app object.
|
||||
* @param kubevela the KubeVela file to deploy.
|
||||
*/
|
||||
public static void deployApplication(NebulousApp app, JsonNode kubevela) {
|
||||
String appUUID = app.getUUID();
|
||||
String clusterName = app.getClusterName();
|
||||
if (!app.setStateDeploying()) {
|
||||
// TODO: wait until we got the performance indicators from Marta
|
||||
log.error("Trying to deploy app that is in state {} (should be READY), aborting deployment",
|
||||
app.getState().name(),
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
app.setStateFailed();
|
||||
return;
|
||||
}
|
||||
// The application name is typed in by the user, and is used
|
||||
// internally by SAL as an unquoted filename in a generated shell
|
||||
// script. It shouldn't be this way but it is what it is.
|
||||
String safeAppName = app.getName().replaceAll("[^a-zA-Z0-9-_]", "_");
|
||||
ExnConnector conn = app.getExnConnector();
|
||||
log.info("Starting initial deployment for application", keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
log.info("Starting initial deployment for application",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
|
||||
int deployGeneration = app.getDeployGeneration() + 1;
|
||||
app.setDeployGeneration(deployGeneration);
|
||||
|
||||
// The overall flow:
|
||||
//
|
||||
// 1. Extract node requirements and node counts from the KubeVela
|
||||
// definition.
|
||||
// 2. Ask resource broker for node candidates for all components and the
|
||||
// controller.
|
||||
// 3. Select node candidates, making sure to only select edge nodes
|
||||
// once.
|
||||
// 4. Create a SAL cluster.
|
||||
// 5. Deploy the SAL cluster.
|
||||
// 6. Add node affinity traits to the KubeVela file.
|
||||
// 7. Deploy the SAL application.
|
||||
// 8. Store cluster state (deployed KubeVela file, etc.) in
|
||||
// NebulousApp object.
|
||||
// - Extract node requirements and node counts from the KubeVela
|
||||
// definition.
|
||||
// - Rewrite KubeVela: remove performance requirements, add affinity
|
||||
// traits
|
||||
// - Ask resource broker for node candidates for all components and the
|
||||
// controller.
|
||||
// - Select node candidates, making sure to only select edge nodes
|
||||
// once.
|
||||
// - Create a SAL cluster.
|
||||
// - Deploy the SAL cluster.
|
||||
// - Add node affinity traits to the KubeVela file.
|
||||
// - Deploy the SAL application.
|
||||
// - Store cluster state (deployed KubeVela file, etc.) in
|
||||
// NebulousApp object.
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 1. Extract node requirements
|
||||
// Extract node requirements
|
||||
Map<String, List<Requirement>> componentRequirements = KubevelaAnalyzer.getClampedRequirements(kubevela);
|
||||
Map<String, Integer> nodeCounts = KubevelaAnalyzer.getNodeCount(kubevela);
|
||||
List<Requirement> controllerRequirements = getControllerRequirements(appUUID);
|
||||
// HACK: do this only when cloud id = nrec
|
||||
componentRequirements.forEach(
|
||||
(k, reqs) -> reqs.add(new AttributeRequirement("location", "name", RequirementOperator.EQ, "bgo")));
|
||||
|
||||
@ -198,49 +233,68 @@ public class NebulousAppDeployer {
|
||||
Main.logFile("component-counts-" + appUUID + ".txt", nodeCounts);
|
||||
Main.logFile("controller-requirements-" + appUUID + ".txt", controllerRequirements);
|
||||
|
||||
// ----------------------------------------
|
||||
// 2. Find node candidates
|
||||
// ------------------------------------------------------------
|
||||
// Rewrite KubeVela
|
||||
JsonNode rewritten = createDeploymentKubevela(kubevela);
|
||||
String rewritten_kubevela = "---\n# Did not manage to create rewritten KubeVela";
|
||||
try {
|
||||
rewritten_kubevela = yamlMapper.writeValueAsString(rewritten);
|
||||
} catch (JsonProcessingException e) {
|
||||
log.error("Failed to convert KubeVela to YAML; this should never happen",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName), e);
|
||||
app.setStateFailed();
|
||||
return;
|
||||
}
|
||||
Main.logFile("rewritten-kubevela-" + appUUID + ".yaml", rewritten_kubevela);
|
||||
|
||||
// TODO: filter by app resources (check enabled: true in resources array)
|
||||
// ----------------------------------------
|
||||
// Find node candidates
|
||||
|
||||
// TODO: filter by app resources / cloud? (check enabled: true in resources array)
|
||||
List<NodeCandidate> controllerCandidates = conn.findNodeCandidates(controllerRequirements, appUUID);
|
||||
if (controllerCandidates.isEmpty()) {
|
||||
log.error("Could not find node candidates for requirements: {}",
|
||||
log.error("Could not find node candidates for requirements: {}, aborting deployment",
|
||||
controllerRequirements, keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
// Continue here while we don't really deploy
|
||||
// return;
|
||||
app.setStateFailed();
|
||||
return;
|
||||
}
|
||||
Map<String, List<NodeCandidate>> componentCandidates = new HashMap<>();
|
||||
for (Map.Entry<String, List<Requirement>> e : componentRequirements.entrySet()) {
|
||||
String nodeName = e.getKey();
|
||||
List<Requirement> requirements = e.getValue();
|
||||
// TODO: filter by app resources (check enabled: true in resources array)
|
||||
// TODO: filter by app resources / cloud? (check enabled: true in resources array)
|
||||
List<NodeCandidate> candidates = conn.findNodeCandidates(requirements, appUUID);
|
||||
if (candidates.isEmpty()) {
|
||||
log.error("Could not find node candidates for for node {}, requirements: {}", nodeName, requirements,
|
||||
log.error("Could not find node candidates for for node {}, requirements: {}, aborting deployment", nodeName, requirements,
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
// Continue here while we don't really deploy
|
||||
// return;
|
||||
app.setStateFailed();
|
||||
return;
|
||||
}
|
||||
componentCandidates.put(nodeName, candidates);
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 3. Select node candidates
|
||||
// Select node candidates
|
||||
|
||||
Map<String, NodeCandidate> nodeEdgeCandidates = new HashMap<>(app.getNodeEdgeCandidates());
|
||||
|
||||
// Controller node
|
||||
log.info("Deciding on controller node candidate", keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
String masterNodeName = "n" + clusterName.toLowerCase() + "-masternode"; // safe because all component node names end with a number
|
||||
// Take care to only use lowercase, numbers, starting with letter
|
||||
String masterNodeName = "m" + clusterName.toLowerCase() + "-master";
|
||||
NodeCandidate masterNodeCandidate = null;
|
||||
if (controllerCandidates.size() > 0) {
|
||||
masterNodeCandidate = controllerCandidates.get(0);
|
||||
if (Set.of(NodeCandidateTypeEnum.BYON, NodeCandidateTypeEnum.EDGE)
|
||||
.contains(masterNodeCandidate.getNodeCandidateType())) {
|
||||
// Mark this candidate as already chosen
|
||||
app.getNodeEdgeCandidates().put(masterNodeName, masterNodeCandidate);
|
||||
nodeEdgeCandidates.put(masterNodeName, masterNodeCandidate);
|
||||
}
|
||||
} else {
|
||||
log.error("Empty node candidate list for controller, continuing without creating node",
|
||||
log.error("Empty node candidate list for controller, aborting deployment",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
app.setStateFailed();
|
||||
return;
|
||||
}
|
||||
|
||||
// Component nodes
|
||||
@ -255,47 +309,57 @@ public class NebulousAppDeployer {
|
||||
// ExnConnector.createCluster
|
||||
// - Each node name and its label (nodeLabels), for
|
||||
// ExnConnector.labelNodes
|
||||
Map<String, Set<String>> componentNodeNames = new HashMap<>();
|
||||
for (Map.Entry<String, List<Requirement>> e : componentRequirements.entrySet()) {
|
||||
String componentName = e.getKey();
|
||||
int numberOfNodes = nodeCounts.get(componentName);
|
||||
Set<String> nodeNames = new HashSet<>();
|
||||
List<NodeCandidate> candidates = componentCandidates.get(componentName);
|
||||
if (candidates.size() == 0) {
|
||||
log.error("Empty node candidate list for component {}, continuing without creating node", componentName,
|
||||
log.error("Empty node candidate list for component {}, aborting deployment", componentName,
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
continue;
|
||||
app.setStateFailed();
|
||||
return;
|
||||
}
|
||||
for (int nodeNumber = 1; nodeNumber <= numberOfNodes; nodeNumber++) {
|
||||
String nodeName = createNodeName(clusterName, componentName, deployGeneration, nodeNumber);
|
||||
String nodeName = createNodeName(clusterName, componentName, app.getDeployGeneration(), nodeNumber);
|
||||
NodeCandidate candidate = candidates.stream()
|
||||
.filter(each -> !app.getNodeEdgeCandidates().values().contains(each))
|
||||
.filter(each -> !nodeEdgeCandidates.values().contains(each))
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
if (candidate == null) {
|
||||
log.error("No available node candidate for node {} of component {}", nodeNumber, componentName,
|
||||
log.error("No available node candidate for node {} of component {}, aborting deployment", nodeNumber, componentName,
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
continue;
|
||||
app.setStateFailed();
|
||||
return;
|
||||
}
|
||||
if (Set.of(NodeCandidateTypeEnum.BYON, NodeCandidateTypeEnum.EDGE).contains(candidate.getNodeCandidateType())) {
|
||||
app.getNodeEdgeCandidates().put(nodeName, candidate);
|
||||
nodeEdgeCandidates.put(nodeName, candidate);
|
||||
}
|
||||
clusterNodes.put(nodeName, candidate);
|
||||
nodeLabels.addObject().put(nodeName, "nebulouscloud.eu/" + componentName + "=yes");
|
||||
nodeNames.add(nodeName);
|
||||
}
|
||||
app.getComponentNodeNames().put(componentName, nodeNames);
|
||||
// XXX TODO do not directly mutate this value
|
||||
componentNodeNames.put(componentName, nodeNames);
|
||||
}
|
||||
Main.logFile("nodenames-" + appUUID + ".txt", app.getComponentNodeNames());
|
||||
Main.logFile("nodenames-" + appUUID + ".txt", componentNodeNames);
|
||||
Main.logFile("master-nodecandidate-" + appUUID + ".txt", masterNodeCandidate);
|
||||
Main.logFile("component-nodecandidates-" + appUUID + ".txt", clusterNodes);
|
||||
try {
|
||||
Main.logFile("component-labels-" + appUUID + ".txt", mapper.writeValueAsString(nodeLabels));
|
||||
} catch (JsonProcessingException e1) {
|
||||
// ignore; the labelNodes method will report the same error later
|
||||
log.error("Internal error: could not convert node labels to string (this should never happen), aborting deployment",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
app.setStateFailed();
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: send performance indicators (for monitoring system, which
|
||||
// needs it before cluster creation)
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 4. Create cluster
|
||||
// Create cluster
|
||||
|
||||
ObjectNode cluster = mapper.createObjectNode();
|
||||
cluster.put("name", clusterName)
|
||||
@ -315,68 +379,52 @@ public class NebulousAppDeployer {
|
||||
});
|
||||
ObjectNode environment = cluster.withObject("/env-var");
|
||||
environment.put("APPLICATION_ID", appUUID);
|
||||
// TODO: add other environment variables, also from app creation
|
||||
// message (it has an "env" array)
|
||||
log.info("Calling defineCluster", keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
boolean defineClusterSuccess = conn.defineCluster(appUUID, clusterName, cluster);
|
||||
if (!defineClusterSuccess) {
|
||||
log.error("Call to defineCluster failed, blindly continuing...",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
log.error("Call to defineCluster failed for message body {}, aborting deployment",
|
||||
cluster, keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
app.setStateFailed();
|
||||
return;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 5. Deploy cluster
|
||||
// Deploy cluster
|
||||
log.info("Calling deployCluster", keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
boolean deployClusterSuccess = conn.deployCluster(appUUID, clusterName);
|
||||
if (!deployClusterSuccess) {
|
||||
log.error("Call to deployCluster failed, blindly continuing...",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
log.error("Call to deployCluster failed, trying to delete cluster and aborting deployment",
|
||||
cluster, keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
app.setStateFailed();
|
||||
conn.deleteCluster(appUUID, clusterName);
|
||||
return;
|
||||
}
|
||||
|
||||
JsonNode clusterState = conn.getCluster(clusterName);
|
||||
while (clusterState == null || !isClusterDeploymentFinished(clusterState)) {
|
||||
// Cluster deployment includes provisioning and booting VMs,
|
||||
// installing various software packages, bringing up a Kubernetes
|
||||
// cluster and installing the NebulOuS runtime. This can take
|
||||
// some minutes.
|
||||
log.info("Waiting for cluster deployment to finish...",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName),
|
||||
keyValue("clusterState", clusterState));
|
||||
try {
|
||||
Thread.sleep(10000);
|
||||
} catch (InterruptedException e1) {
|
||||
// ignore
|
||||
}
|
||||
// TODO: distinguish between clusterState==null because SAL hasn't
|
||||
// set up its datastructures yet, and clusterState==null because
|
||||
// the call to getCluster failed. In the latter case we want to
|
||||
// abort (because someone has deleted the cluster), in the former
|
||||
// case we want to continue.
|
||||
clusterState = conn.getCluster(clusterName);
|
||||
if (!waitForClusterDeploymentFinished(conn, clusterName, appUUID)) {
|
||||
log.error("Error while waiting for deployCluster to finish, trying to delete cluster and aborting deployment",
|
||||
cluster, keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
app.setStateFailed();
|
||||
conn.deleteCluster(appUUID, clusterName);
|
||||
return;
|
||||
}
|
||||
|
||||
log.info("Cluster deployment finished, continuing with app deployment",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName),
|
||||
keyValue("clusterState", clusterState));
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
|
||||
log.info("Calling labelCluster", keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
boolean labelClusterSuccess = conn.labelNodes(appUUID, clusterName, nodeLabels);
|
||||
if (!labelClusterSuccess) {
|
||||
log.error("Call to deployCluster failed, blindly continuing...",
|
||||
log.error("Call to deployCluster failed, aborting deployment",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
app.setStateFailed();
|
||||
conn.deleteCluster(appUUID, clusterName);
|
||||
return;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 6. Rewrite KubeVela
|
||||
JsonNode rewritten = createDeploymentKubevela(kubevela);
|
||||
String rewritten_kubevela = "---\n# Did not manage to create rewritten KubeVela";
|
||||
try {
|
||||
rewritten_kubevela = yamlMapper.writeValueAsString(rewritten);
|
||||
} catch (JsonProcessingException e) {
|
||||
log.error("Failed to convert KubeVela to YAML; this should never happen",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName), e);
|
||||
}
|
||||
Main.logFile("rewritten-kubevela-" + appUUID + ".yaml", rewritten_kubevela);
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 7. Deploy application
|
||||
// Deploy application
|
||||
|
||||
log.info("Calling deployApplication", keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
long proActiveJobID = conn.deployApplication(appUUID, clusterName, safeAppName, rewritten_kubevela);
|
||||
@ -385,19 +433,16 @@ public class NebulousAppDeployer {
|
||||
if (proActiveJobID == 0) {
|
||||
// 0 means conversion from long has failed (because of an invalid
|
||||
// response), OR a ProActive job id of 0.
|
||||
log.warn("Job ID = 0, this means that deployApplication has probably failed.",
|
||||
log.error("DeployApplication ProActive job ID = 0, deployApplication has probably failed; aborting deployment.",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
app.setStateFailed();
|
||||
conn.deleteCluster(appUUID, clusterName);
|
||||
return;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// 8. Update NebulousApp state
|
||||
// Update NebulousApp state
|
||||
|
||||
// TODO: send out AMPL (must be done after deployCluster, once we know
|
||||
// how to pass the application id into the fresh cluster)
|
||||
|
||||
app.setComponentRequirements(componentRequirements);
|
||||
app.setComponentReplicaCounts(nodeCounts);
|
||||
app.setDeployedKubevela(rewritten);
|
||||
app.setStateDeploymentFinished(componentRequirements, nodeCounts, componentNodeNames, nodeEdgeCandidates, rewritten);
|
||||
log.info("App deployment finished.",
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
}
|
||||
@ -417,11 +462,10 @@ public class NebulousAppDeployer {
|
||||
public static void redeployApplication(NebulousApp app, ObjectNode kubevela) {
|
||||
String appUUID = app.getUUID();
|
||||
String clusterName = app.getClusterName();
|
||||
int deployGeneration = app.getDeployGeneration() + 1;
|
||||
ExnConnector conn = app.getExnConnector();
|
||||
app.setDeployGeneration(deployGeneration);
|
||||
app.setStateDeploying();
|
||||
|
||||
log.info("Starting redeployment generation {}", deployGeneration,
|
||||
log.info("Starting redeployment generation {}", app.getDeployGeneration(),
|
||||
keyValue("appId", appUUID), keyValue("clusterName", clusterName));
|
||||
// The overall flow:
|
||||
//
|
||||
@ -476,7 +520,7 @@ public class NebulousAppDeployer {
|
||||
continue;
|
||||
}
|
||||
for (int nodeNumber = 1; nodeNumber <= nAdd; nodeNumber++) {
|
||||
String nodeName = createNodeName(clusterName, componentName, deployGeneration, nodeNumber);
|
||||
String nodeName = createNodeName(clusterName, componentName, app.getDeployGeneration(), nodeNumber);
|
||||
NodeCandidate candidate = candidates.stream()
|
||||
.filter(each -> !app.getNodeEdgeCandidates().values().contains(each))
|
||||
.findFirst()
|
||||
@ -532,7 +576,7 @@ public class NebulousAppDeployer {
|
||||
continue;
|
||||
}
|
||||
for (int nodeNumber = 1; nodeNumber <= componentReplicaCounts.get(componentName); nodeNumber++) {
|
||||
String nodeName = createNodeName(clusterName, componentName, deployGeneration, nodeNumber);
|
||||
String nodeName = createNodeName(clusterName, componentName, app.getDeployGeneration(), nodeNumber);
|
||||
NodeCandidate candidate = candidates.stream()
|
||||
.filter(each -> !app.getNodeEdgeCandidates().values().contains(each))
|
||||
.findFirst()
|
||||
|
Loading…
x
Reference in New Issue
Block a user