Graceful error handling via workflow commands

Workflow rules can now define "failure" blocks to handle the various
errors and exceptions that occur while executing the actions.
These blocks are passed to the 'update-cf-stack' and 'send-command'
functions, so their callbacks can call them to handle the errors and
exceptions. The actual error and exception data may be passed via the
context in the same way as the result is passed to "success" handlers.

If the 'failure' block is omitted, a global exception is raised, which
interrupts the workflow execution and reports an unhandled error to
the API at the error level.
To gracefully stop the workflow execution without throwing an exception,
a 'failure' block may define a '<stop/>' command, which interrupts the
execution after the end of the current loop and without throwing any
exceptions.

These changes make it possible to handle exceptions and unexpected
states while interacting with Heat, as well as agent-side exceptions
delivered to the conductor from the Agent. The same approach also works
for handling a timeout while waiting for a task result from the Agent.

To support timeouts, the 'send-command' function must be passed a
'timeout' parameter. If it is omitted, the timeout is considered to
be infinite.

The workflows have been updated with failure blocks on all the
commands. These blocks contain error-level reporting and a <stop/>
command to interrupt the flow.
No timeouts were set in the workflows, so the timeout feature is
currently inactive for the existing workflows (as the
'initialization timeout' concept needs to be introduced).

Change-Id: Ia791d4656463240ed197bcd90b9d9eae648270af
This commit is contained in:
Alexander Tivelkov 2013-08-05 20:43:54 +04:00
parent 4a0d31c09e
commit 8203a6ac07
11 changed files with 412 additions and 104 deletions

View File

@ -12,7 +12,7 @@
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Creating instance <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
<update-cf-stack template="Windows">
<update-cf-stack template="Windows" error="exception">
<parameter name="mappings">
<map>
<mapping name="instanceName"><select path="state.hostname"/></mapping>
@ -40,12 +40,19 @@
<parameter name="text">Instance <select path="state.hostname"/> (<select path="name"/>) created</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to deploy instance <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="message" default="unknown Heat error"/> </parameter>
</report>
<stop/>
</failure>
</update-cf-stack>
</rule>
<rule match="$.services[?(@.type == 'activeDirectory')].units[?(@.temp.instanceName and @.adminPassword and @.adminPassword != @.state.adminPassword)]"
desc="Units of AD services which have got instances deployed but the local admin passwords not set yet">
<send-command template="SetPassword">
<send-command template="SetPassword" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -64,12 +71,19 @@
<select path="adminPassword"/>
</set>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to set admin password on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type == 'activeDirectory' and @.adminPassword and @.adminPassword != @.state.domainAdminPassword)].units[?(@.temp.instanceName and @.isMaster)]"
desc="Deployed master-units of AD services for which the domain admin password is not set yet">
<send-command template="SetPassword">
<send-command template="SetPassword" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -88,6 +102,13 @@
<select path="::adminPassword"/>
</set>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to set domain administrator password on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
@ -97,7 +118,7 @@
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Creating Primary Domain Controller on unit <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
<send-command template="CreatePrimaryDC">
<send-command template="CreatePrimaryDC" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -121,12 +142,19 @@
<parameter name="text">Primary Domain Controller created</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to create a Primary DC on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type == 'activeDirectory' and @.state.primaryDc and not @.state.primaryDcIp)].units[?(@.temp.instanceName and @.isMaster)]"
desc="Master Units of AD services on which the Primary Domain Controller has been configured but DNS ip has not been asked for">
<send-command template="AskDnsIp" result="ip">
<send-command template="AskDnsIp" result="ip" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -138,12 +166,19 @@
<select source="ip" path="0.Result.0"/>
</set>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable assign DNS IP on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type != 'activeDirectory')].units[?(@.state.domain and not @.domain)]"
desc="Any non-AD services of the environment which has been part of the domain but needs to leave it">
<send-command template="LeaveDomain">
<send-command template="LeaveDomain" error="exception">
<parameter name="unit">
<select path="id" source="unit"/>
</parameter>
@ -165,6 +200,13 @@
</report>
<set path="state.domain"><null/></set>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unit <select path="state.hostname" source="unit"/> (<select path="name" source="unit"/>) was unable to leave the domain due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
@ -178,7 +220,7 @@
</set>
<rule desc="Domain controller exists with the assigned DNS IP">
<parameter name="match">/$.services[?(@.type == 'activeDirectory' and @.domain == '<select path="domain"/>' and @.state.primaryDcIp)]</parameter>
<send-command template="JoinDomain">
<send-command template="JoinDomain" error="exception">
<parameter name="unit">
<select path="id" source="unit"/>
</parameter>
@ -212,6 +254,13 @@
<parameter name="text">Unit <select path="state.hostname" source="unit"/> (<select path="name" source="unit"/>) has joined domain <select path="domain"/></parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unit <select path="state.hostname" source="unit"/> (<select path="name" source="unit"/>) was unable to join the domain due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
</rule>
@ -223,7 +272,7 @@
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Creating Secondary Domain Controller on unit <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
<send-command template="CreateSecondaryDC">
<send-command template="CreateSecondaryDC" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -251,6 +300,17 @@
<parameter name="text">Domain <select path="::domain"/> created</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to create Secondary Domain Controller on unit <select path="state.hostname" source="unit"/> (<select path="name" source="unit"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<report entity="service" level="error">
<parameter name="id"><select path="::id"/></parameter>
<parameter name="text">Unable to create domain <select path="::domain"/></parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
</workflow>

View File

@ -12,7 +12,7 @@
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Creating instance <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
<update-cf-stack template="Windows">
<update-cf-stack template="Windows" error="exception">
<parameter name="mappings">
<map>
<mapping name="instanceName"><select path="state.hostname"/></mapping>
@ -40,12 +40,19 @@
<parameter name="text">Instance <select path="state.hostname"/> (<select path="name"/>) created</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to deploy instance <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="message" default="unknown Heat error"/> </parameter>
</report>
<stop/>
</failure>
</update-cf-stack>
</rule>
<rule match="$.services[?(@.type in ('webServerFarm', 'aspNetAppFarm'))].units[?(@.state.hostname and not @.temp.registeredWithLB)]"
desc="Units of web-farms services which have a hostname assigned but are not registered with LB">
<update-cf-stack template="LoadBalancer" result="outputs">
<update-cf-stack template="LoadBalancer" result="outputs" error="exception">
<parameter name="mappings">
<map>
<mapping name="instanceName"><select path="state.hostname"/></mapping>
@ -57,12 +64,19 @@
<set path="temp.registeredWithLB"><true/></set>
<set path="::uri">http://<select source="outputs" path="LoadBalancerIP"/>:<select path="::loadBalancerPort"/></set>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to create a Server Farm load balancer on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="message" default="unknown Heat error"/> </parameter>
</report>
<stop/>
</failure>
</update-cf-stack>
</rule>
<rule match="$.services[?(@.type in ('webServer', 'aspNetApp', 'webServerFarm', 'aspNetAppFarm') and @.adminPassword and @.adminPassword != @.state.adminPassword)].units[?(@.temp.instanceName)]"
desc="Units of web services which have got an instance deployed but has not got a correct admin password ">
<send-command template="SetPassword">
<send-command template="SetPassword" error='exception'>
<parameter name="unit">
<select path="id"/>
</parameter>
@ -81,6 +95,13 @@
<select path="::adminPassword"/>
</set>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to set admin password on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
@ -91,7 +112,7 @@
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Creating IIS Web Server on unit <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
<send-command template="InstallIIS">
<send-command template="InstallIIS" error='exception'>
<parameter name="unit">
<select path="id"/>
</parameter>
@ -105,6 +126,13 @@
<parameter name="text">IIS <select path="state.hostname"/> (<select path="name"/>) has started</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to install IIS on <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
@ -114,7 +142,7 @@
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Deploying WebApp <select path="::name"/> on unit <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
<send-command template="DeployWebApp">
<send-command template="DeployWebApp" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -135,6 +163,13 @@
<parameter name="text">WebApp <select path="::name"/> has been deployed on unit <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to deploy WebApp on <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>

View File

@ -43,7 +43,7 @@
<parameter name="match">$[?(@.state.domain != '<select path="domain" source="ad"/>')]</parameter>
<parameter name="desc">Units which are not part of the target domain but need to join</parameter>
<send-command template="JoinDomain">
<send-command template="JoinDomain" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -79,6 +79,13 @@
<parameter name="text">Unit <select path="state.hostname"/> (<select path="name"/>) has joined domain <select path="domain" source="ad"/></parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unit <select path="state.hostname"/> (<select path="name"/>) was unable to join the domain due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
</rule>

View File

@ -13,7 +13,7 @@
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Creating instance <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
<update-cf-stack template="Windows">
<update-cf-stack template="Windows" error="exception">
<parameter name="mappings">
<map>
<mapping name="instanceName"><select path="state.hostname"/></mapping>
@ -41,6 +41,13 @@
<parameter name="text">Instance <select path="state.hostname"/> (<select path="name"/>) created</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to deploy instance <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="message" default="unknown Heat error"/> </parameter>
</report>
<stop/>
</failure>
</update-cf-stack>
</rule>
@ -58,7 +65,7 @@
<rule match="$.services[?(@.type == 'msSqlClusterServer' and @.adminPassword and @.adminPassword != @.state.adminPassword)].units[?(@.temp.instanceName)]"
desc="Units of SQL Server Cluster services which have got an instance deployed but has not got a correct admin password">
<send-command template="SetPassword">
<send-command template="SetPassword" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -77,12 +84,19 @@
<select path="::adminPassword"/>
</set>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to set admin password on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type == 'msSqlClusterServer')].units[?(@.state.domain and not @.state.failoverClusterPrerequisitesInstalled)]"
desc="Units of SQL Server Cluster services that are already joined AD domain">
<send-command template="SqlServerCluster/FailoverClusterPrerequisites">
<send-command template="SqlServerCluster/FailoverClusterPrerequisites" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -109,12 +123,19 @@
<parameter name="text">Failover cluster prerequisites installed on unit <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to install prerequisites on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type == 'msSqlClusterServer' and not @.state.failoverClusterCreated)].units[?(@.state.failoverClusterPrerequisitesInstalled)]" limit="1"
desc="First unit of SQL Server Cluster services that is already has failover cluster prerequisites installed">
<send-command template="SqlServerCluster/FailoverCluster">
<send-command template="SqlServerCluster/FailoverCluster" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -147,12 +168,19 @@
<parameter name="text">Failover cluster created for SQL Server Cluster service (<select path="::name"/>)</parameter>
</report>
</success>
<failure>
<report entity="Service" level="error">
<parameter name="id"><select path="::id"/></parameter>
<parameter name="text">Unable to create failover cluster for SQL Server Service <select path="::name"/> due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type == 'msSqlClusterServer' and @.state.failoverClusterCreated and not @.state.agEnvironmentConfigured)].units[*]" limit="1"
desc="First unit of SQL Server Cluster services that is already has failover cluster created">
<send-command template="SqlServerCluster/ConfigureEnvironmentForAOAG">
<send-command template="SqlServerCluster/ConfigureEnvironmentForAOAG" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -188,12 +216,19 @@
<parameter name="text">Environment for AlwaysOn Availability Group of SQL Server Cluster service (<select path="::name"/>) configured</parameter>
</report>
</success>
<failure>
<report entity="service" level="error">
<parameter name="id"><select path="::id"/></parameter>
<parameter name="text">Unable to configure the environment for AlwaysOn Availability Group of SQL Server Cluster service (<select path="::name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type == 'msSqlClusterServer' and @.state.agEnvironmentConfigured)].units[?(@.state.failoverClusterPrerequisitesInstalled and not @.state.sqlServerInstalled)]"
desc="All units of SQL Server Cluster services that is already has environment configured">
<send-command template="SqlServerCluster/InstallSqlServerForAOAG">
<send-command template="SqlServerCluster/InstallSqlServerForAOAG" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -220,12 +255,19 @@
<parameter name="text">SQL Server installed on unit <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to install SQL Server on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type == 'msSqlClusterServer')].units[?(@.state.sqlServerInstalled and not @.state.alwaysOnInitialized)]"
desc="All units of SQL Server Cluster services that has SQL Server installed">
<send-command template="SqlServerCluster/InitializeAlwaysOn">
<send-command template="SqlServerCluster/InitializeAlwaysOn" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -258,12 +300,19 @@
<parameter name="text">AlwaysOn AG initialized for <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to initialize AlwaysOn AG for <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type == 'msSqlClusterServer')].units[?(@.state.alwaysOnInitialized and not @.state.primaryReplicaInitialized)]"
desc="All units of SQL Server Cluster services that has AlwaysOn initialized">
<send-command template="SqlServerCluster/InitializeAOAGPrimaryReplica">
<send-command template="SqlServerCluster/InitializeAOAGPrimaryReplica" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -311,12 +360,19 @@
<parameter name="text">Primary replica for SQL Server AG initialized for <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to initialize primary replica for SQL Server AG for <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
<rule match="$.services[?(@.type == 'msSqlClusterServer')].units[?(@.state.primaryReplicaInitialized and not @.state.secondaryReplicaInitialized)]"
desc="All units of SQL Server Cluster services that has primary replica initialized">
<send-command template="SqlServerCluster/InitializeAOAGSecondaryReplica">
<send-command template="SqlServerCluster/InitializeAOAGSecondaryReplica" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -349,6 +405,13 @@
<parameter name="text">Secondary replica for SQL Server AG initialized for <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to initialize secondary replica for SQL Server AG for <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>

View File

@ -12,7 +12,7 @@
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Creating instance <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
<update-cf-stack template="Windows">
<update-cf-stack template="Windows" error="exception">
<parameter name="mappings">
<map>
<mapping name="instanceName"><select path="state.hostname"/></mapping>
@ -40,12 +40,19 @@
<parameter name="text">Instance <select path="state.hostname"/> (<select path="name"/>) created</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to deploy instance <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="message" default="unknown Heat error"/> </parameter>
</report>
<stop/>
</failure>
</update-cf-stack>
</rule>
<rule match="$.services[?(@.type == 'msSqlServer' and @.adminPassword and @.adminPassword != @.state.adminPassword)].units[?(@.temp.instanceName)]"
desc="Units of SQL Server services which have got an instance deployed but has not got a correct admin password">
<send-command template="SetPassword">
<send-command template="SetPassword" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -64,6 +71,13 @@
<select path="::adminPassword"/>
</set>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to set admin password on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>
@ -74,7 +88,7 @@
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Creating MS SQL Server on unit <select path="state.hostname"/> (<select path="name"/>)</parameter>
</report>
<send-command template="InstallMsSqlServer">
<send-command template="InstallMsSqlServer" error="exception">
<parameter name="unit">
<select path="id"/>
</parameter>
@ -99,6 +113,13 @@
<parameter name="text">MS SQL Server <select path="state.hostname"/> (<select path="name"/>) has started</parameter>
</report>
</success>
<failure>
<report entity="unit" level="error">
<parameter name="id"><select path="id"/></parameter>
<parameter name="text">Unable to install MS SQL Server on unit <select path="state.hostname"/> (<select path="name"/>) due to <select source="exception" path="0.messages.0" default="unknown Agent error"/> </parameter>
</report>
<stop/>
</failure>
</send-command>
</rule>

View File

@ -77,7 +77,6 @@ class ConductorWorkflowService(service.Service):
def _task_received(self, message):
task = message.body or {}
message_id = message.id
reporter = None
with self.create_rmq_client() as mq:
try:
log.info('Starting processing task {0}: {1}'.format(
@ -96,7 +95,8 @@ class ConductorWorkflowService(service.Service):
reporter)
workflows.append(workflow)
while True:
stop = False
while not stop:
try:
while True:
result = False
@ -112,17 +112,18 @@ class ConductorWorkflowService(service.Service):
log.debug("No pending commands found, "
"seems like we are done")
break
if self.check_stop_requested(task):
log.info("Workflow stop requested")
stop = True
except Exception as ex:
reporter.report_generic(
"Unexpected error has occurred", ex.message,
'error')
log.exception(ex)
break
command_dispatcher.close()
except reporting.ReportedException as e:
log.exception("Exception has occurred and was reported to API")
except Exception as e:
log.exception("Unexpected exception has occurred")
if reporter:
reporter.report_generic("Unexpected error has occurred",
e.message, 'error')
if stop:
log.info("Workflow stopped by 'stop' command")
finally:
self.cleanup(task, reporter)
result_msg = Message()
@ -156,3 +157,9 @@ class ConductorWorkflowService(service.Service):
if reporter:
reporter.report_generic("Unexpected error has occurred",
e.message, 'error')
def check_stop_requested(self, model):
    """Tell whether a stop of the workflow execution was requested.

    The flag is read from ``model['temp']['_stop_requested']``.

    :param model: dict-like task model being processed.
    :returns: the stored flag value, or False when either the 'temp'
        section or the flag itself is absent.
    """
    # 'temp' may be missing or explicitly null; both mean "no flag set",
    # so fall back to an empty dict instead of failing on the lookup.
    temp = model.get('temp') or {}
    return temp.get('_stop_requested', False)

View File

@ -20,14 +20,35 @@ import string
import time
import xml_code_engine
from openstack.common import log as logging
log = logging.getLogger(__name__)
def update_cf_stack(engine, context, body, template, result=None, **kwargs):
def update_cf_stack(engine, context, body, template, result=None, error=None,
**kwargs):
command_dispatcher = context['/commandDispatcher']
def callback(result_value):
def callback(result_value, error_result=None):
if result is not None:
context[result] = result_value
if error_result is not None:
if error is not None:
context[error] = {
'message': getattr(error_result, 'message', None),
'strerror': getattr(error_result, 'strerror', None),
'timestamp': time.time()
}
failure_handler = body.find('failure')
if failure_handler is not None:
log.warning("Handling exception in failure block")
engine.evaluate_content(failure_handler, context)
return
else:
log.error("No failure block found for exception")
raise error_result
success_handler = body.find('success')
if success_handler is not None:
engine.evaluate_content(success_handler, context)

View File

@ -93,64 +93,64 @@ class HeatExecutor(CommandBase):
self._delete_pending_list) > 0
def execute_pending(self):
try:
r1 = self._execute_pending_updates()
r2 = self._execute_pending_deletes()
except Exception as e:
self._reporter.report_generic("Unable to execute Heat command",
e.message, "error")
trace = sys.exc_info()[2]
raise ReportedException(e.message), None, trace
r1 = self._execute_pending_updates()
r2 = self._execute_pending_deletes()
return r1 or r2
def _execute_pending_updates(self):
if not len(self._update_pending_list):
return False
template, arguments = self._get_current_template()
stack_exists = (template != {})
try:
template, arguments = self._get_current_template()
stack_exists = (template != {})
# do not need to merge with current stack cause we rebuilding it
# from scratch on every deployment
template, arguments = ({}, {})
# do not need to merge with current stack cause we rebuilding it from
# scratch on every deployment
template, arguments = ({}, {})
for t in self._update_pending_list:
template = muranoconductor.helpers.merge_dicts(template,
t['template'])
arguments = muranoconductor.helpers.merge_dicts(arguments,
t['arguments'],
max_levels=1)
log.info(
'Executing heat template {0} with arguments {1} on stack {2}'
.format(anyjson.dumps(template), arguments, self._stack))
for t in self._update_pending_list:
template = muranoconductor.helpers.merge_dicts(
template, t['template'])
arguments = muranoconductor.helpers.merge_dicts(
arguments, t['arguments'], max_levels=1)
if stack_exists:
self._heat_client.stacks.update(
stack_id=self._stack,
parameters=arguments,
template=template)
log.debug(
'Waiting for the stack {0} to be update'.format(
self._stack))
outs = self._wait_state('UPDATE_COMPLETE')
log.info('Stack {0} updated'.format(self._stack))
else:
self._heat_client.stacks.create(
stack_name=self._stack,
parameters=arguments,
template=template)
log.info(
'Executing heat template {0} with arguments {1} on stack {2}'
.format(anyjson.dumps(template), arguments, self._stack))
log.debug('Waiting for the stack {0} to be create'.format(
self._stack))
outs = self._wait_state('CREATE_COMPLETE')
log.info('Stack {0} created'.format(self._stack))
if stack_exists:
self._heat_client.stacks.update(
stack_id=self._stack,
parameters=arguments,
template=template)
log.debug(
'Waiting for the stack {0} to be update'.format(self._stack))
outs = self._wait_state('UPDATE_COMPLETE')
log.info('Stack {0} updated'.format(self._stack))
else:
self._heat_client.stacks.create(
stack_name=self._stack,
parameters=arguments,
template=template)
pending_list = self._update_pending_list
self._update_pending_list = []
log.debug('Waiting for the stack {0} to be create'.format(
self._stack))
outs = self._wait_state('CREATE_COMPLETE')
log.info('Stack {0} created'.format(self._stack))
pending_list = self._update_pending_list
self._update_pending_list = []
for item in pending_list:
item['callback'](outs)
return True
for item in pending_list:
item['callback'](outs)
return True
except Exception as ex:
pending_list = self._update_pending_list
self._update_pending_list = []
for item in pending_list:
item['callback'](None, ex)
return True
def _execute_pending_deletes(self):
if not len(self._delete_pending_list):

View File

@ -18,7 +18,8 @@ class WindowsAgentExecutor(CommandBase):
self._reporter = reporter
rmqclient.declare(self._results_queue)
def execute(self, template, mappings, unit, service, callback):
def execute(self, template, mappings, unit, service, callback,
timeout=None):
with open('data/templates/agent/%s.template' % template) as t_file:
template_data = t_file.read()
@ -29,7 +30,8 @@ class WindowsAgentExecutor(CommandBase):
queue = ('%s-%s-%s' % (self._stack, service, unit)).lower()
self._pending_list.append({
'id': msg_id,
'callback': callback
'callback': callback,
'timeout': timeout
})
msg = Message()
@ -49,15 +51,53 @@ class WindowsAgentExecutor(CommandBase):
with self._rmqclient.open(self._results_queue) as subscription:
while self.has_pending_commands():
log.debug("Waiting for responses to be returned by the agent. "
"%i total responses remain", len(self._pending_list))
msg = subscription.get_message()
msg.ack()
msg_id = msg.id.lower()
item, index = muranoconductor.helpers.find(
lambda t: t['id'] == msg_id, self._pending_list)
if item:
self._pending_list.pop(index)
item['callback'](msg.body)
# TODO: Add extended initialization timeout
# For now, all the timeouts are defined by the command input.
# However, the first reply which we wait for from the unit may be
# delayed due to long unit initialization and startup. So, for
# non-initialized units we need to extend the command's timeout
# with the initialization timeout.
timeout = self.get_max_timeout()
if timeout:
span_message = "for {0} seconds".format(timeout)
else:
span_message = 'infinitely'
log.debug("Waiting %s for responses to be returned"
" by the agent. %i total responses remain",
span_message, len(self._pending_list))
msg = subscription.get_message(timeout=timeout)
if msg:
msg.ack()
msg_id = msg.id.lower()
item, index = muranoconductor.helpers.find(
lambda t: t['id'] == msg_id, self._pending_list)
if item:
self._pending_list.pop(index)
item['callback'](msg.body)
else:
while self.has_pending_commands():
item = self._pending_list.pop()
item['callback'](AgentTimeoutException(timeout))
return True
def get_max_timeout(self):
    """Return the longest timeout among the pending commands.

    Returns ``None`` (meaning "wait infinitely") as soon as a pending
    command without a timeout is met; returns 0 when nothing is pending.
    """
    longest = 0
    for pending in self._pending_list:
        wait = pending['timeout']
        if wait is None:
            # A single unbounded command makes the whole wait unbounded.
            return None
        if wait > longest:
            longest = wait
    return longest
class AgentTimeoutException(Exception):
    """Signals that no response was received from the agent in time.

    Attributes:
        message: human-readable description that includes the timeout.
        timeout: the timeout value the exception was created with.
    """

    def __init__(self, timeout):
        # Pass the raw constructor argument on, so ``args`` stays
        # ``(timeout,)`` exactly as before (keeps copy/pickle working).
        super(AgentTimeoutException, self).__init__(timeout)
        self.message = "Unable to receive any response from the agent" \
                       " in {0} sec".format(timeout)
        self.timeout = timeout

    def __str__(self):
        # Without this, str(exc) would be just the timeout value and the
        # description would be missing from logs and tracebacks.
        return self.message
class UnhandledAgentException(Exception):
    """Signals agent-side errors that no handler took care of.

    Attributes:
        message: human-readable description that embeds ``errors``.
        errors: the error data the exception was created with.
    """

    def __init__(self, errors):
        # Pass the raw constructor argument on, so ``args`` stays
        # ``(errors,)`` exactly as before (keeps copy/pickle working).
        super(UnhandledAgentException, self).__init__(errors)
        self.message = "An unhandled exception has " \
                       "occurred in the Agent: {0}".format(errors)
        self.errors = errors

    def __str__(self):
        # Without this, str(exc) would be the bare repr of ``errors`` and
        # the explanatory prefix would never reach logs or tracebacks.
        return self.message

View File

@ -12,33 +12,77 @@
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from muranoconductor.commands.windows_agent import AgentTimeoutException
from muranoconductor.commands.windows_agent import UnhandledAgentException
import xml_code_engine
from openstack.common import log as logging
log = logging.getLogger(__name__)
def send_command(engine, context, body, template, service, unit, mappings=None,
                 result=None, error=None, timeout=None, **kwargs):
    """Queue an agent command and route its outcome to workflow handlers.

    The command is handed to the command dispatcher found in the context
    under '/commandDispatcher'; the outcome is processed later, when the
    dispatcher invokes the callback defined here.

    :param engine: code engine used to evaluate the handler blocks
    :param context: workflow context; receives results and errors
    :param body: element of this command; its 'success' and 'failure'
        children (if any) are evaluated as handlers
    :param template: agent template name, forwarded to the dispatcher
    :param service: service identifier, forwarded to the dispatcher
    :param unit: unit identifier, forwarded to the dispatcher
    :param mappings: template mappings; an empty dict when not given
    :param result: context key to store the successful results under
    :param error: context key to store the collected errors under
    :param timeout: response timeout; converted to int when truthy,
        forwarded unchanged otherwise (None is passed on as is)
    :raises AgentTimeoutException: from the callback, when the wait timed
        out and the body has no 'failure' block
    :raises UnhandledAgentException: from the callback, when the agent
        reported errors and the body has no 'failure' block
    """
    if not mappings:
        mappings = {}
    command_dispatcher = context['/commandDispatcher']
    if timeout:
        timeout = int(timeout)

    def callback(result_value):
        log.info(
            'Received result from {2} for {0}: {1}'.format(
                template, result_value, unit))
        ok = []
        errors = []
        if isinstance(result_value, AgentTimeoutException):
            # A timeout is delivered as an exception object instead of a
            # response body, so it must be checked before any indexing.
            errors.append({
                'type': "timeout",
                'messages': [result_value.message],
                'timeout': result_value.timeout
            })
        elif result_value['IsException']:
            # The whole execution failed on the agent side.
            msg = "A general exception has occurred in the Agent: " + \
                  result_value['Result']
            errors.append({
                'type': "general",
                'messages': [msg],
            })
        else:
            # Each entry of 'Result' carries its own exception flag.
            for res in result_value['Result']:
                if res['IsException']:
                    errors.append({
                        'type': 'inner',
                        'messages': res['Result']
                    })
                else:
                    ok.append(res)

        if ok:
            if result is not None:
                context[result] = ok
            success_handler = body.find('success')
            if success_handler is not None:
                engine.evaluate_content(success_handler, context)
        if errors:
            if error is not None:
                context[error] = errors
            failure_handler = body.find('failure')
            if failure_handler is not None:
                engine.evaluate_content(failure_handler, context)
            else:
                # Nobody handles the failure: escalate it as an exception.
                log.error("No failure block found for exception")
                if isinstance(result_value, AgentTimeoutException):
                    raise result_value
                else:
                    raise UnhandledAgentException(errors)

    command_dispatcher.execute(
        name='agent', template=template, mappings=mappings,
        unit=unit, service=service, callback=callback, timeout=timeout)
# Register send_command with the XML code engine under the "send-command" name.
xml_code_engine.XmlCodeEngine.register_function(send_command, "send-command")

View File

@ -203,6 +203,13 @@ class Workflow(object):
return True
return False
@staticmethod
def _stop_func(context, body, engine, **kwargs):
if not 'temp' in context['/dataSource']:
context['/dataSource']['temp'] = {}
context['/dataSource']['temp']['_stop_requested'] = True
# Register Workflow._rule_func with the XML code engine under the 'rule' name.
xml_code_engine.XmlCodeEngine.register_function(
Workflow._rule_func, 'rule')
@ -216,6 +223,9 @@ xml_code_engine.XmlCodeEngine.register_function(
# Register Workflow._select_func under the 'select' name.
xml_code_engine.XmlCodeEngine.register_function(
Workflow._select_func, 'select')
# Register Workflow._stop_func under the 'stop' name.
xml_code_engine.XmlCodeEngine.register_function(
Workflow._stop_func, 'stop')
# Register Workflow._select_all_func under the 'select-all' name.
xml_code_engine.XmlCodeEngine.register_function(
Workflow._select_all_func, 'select-all')