diff --git a/doc-tools-check-languages.conf b/doc-tools-check-languages.conf
index 9a25534d4e..4d08f4ba13 100644
--- a/doc-tools-check-languages.conf
+++ b/doc-tools-check-languages.conf
@@ -55,6 +55,7 @@ declare -A SPECIAL_BOOKS=(
["config-reference"]="skip"
["contributor-guide"]="skip"
["releasenotes"]="skip"
+ ["ha-guide-draft"]="skip"
# Skip old arch design, will be archived
["arch-design-to-archive"]="skip"
)
diff --git a/doc/ha-guide-draft/setup.cfg b/doc/ha-guide-draft/setup.cfg
new file mode 100644
index 0000000000..44d048a0e1
--- /dev/null
+++ b/doc/ha-guide-draft/setup.cfg
@@ -0,0 +1,27 @@
+[metadata]
+name = openstackhaguide
+summary = OpenStack High Availability Guide
+author = OpenStack
+author-email = openstack-docs@lists.openstack.org
+home-page = https://docs.openstack.org/
+classifier =
+    Environment :: OpenStack
+    Intended Audience :: Information Technology
+    Intended Audience :: System Administrators
+    License :: OSI Approved :: Apache Software License
+    Operating System :: POSIX :: Linux
+    Topic :: Documentation
+
+[global]
+setup-hooks =
+ pbr.hooks.setup_hook
+
+[files]
+
+[build_sphinx]
+warning-is-error = 1
+build-dir = build
+source-dir = source
+
+[wheel]
+universal = 1
diff --git a/doc/ha-guide-draft/setup.py b/doc/ha-guide-draft/setup.py
new file mode 100644
index 0000000000..736375744d
--- /dev/null
+++ b/doc/ha-guide-draft/setup.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# Copyright (c) 2013 Hewlett-Packard Development Company, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# THIS FILE IS MANAGED BY THE GLOBAL REQUIREMENTS REPO - DO NOT EDIT
+import setuptools
+
+# In python < 2.7.4, a lazy loading of package `pbr` will break
+# setuptools if some other modules registered functions in `atexit`.
+# solution from: http://bugs.python.org/issue15881#msg170215
+try:
+ import multiprocessing # noqa
+except ImportError:
+ pass
+
+setuptools.setup(
+ setup_requires=['pbr'],
+ pbr=True)
diff --git a/doc/ha-guide-draft/source/common b/doc/ha-guide-draft/source/common
new file mode 120000
index 0000000000..dc879abe93
--- /dev/null
+++ b/doc/ha-guide-draft/source/common
@@ -0,0 +1 @@
+../../common
\ No newline at end of file
diff --git a/doc/ha-guide-draft/source/compute-node-ha.rst b/doc/ha-guide-draft/source/compute-node-ha.rst
new file mode 100644
index 0000000000..b1fb659269
--- /dev/null
+++ b/doc/ha-guide-draft/source/compute-node-ha.rst
@@ -0,0 +1,55 @@
+============================
+Configuring the compute node
+============================
+
+The `Installation Tutorials and Guides
+`_
+provide instructions for installing multiple compute nodes.
+To make the compute nodes highly available, you must configure the
+environment to include multiple instances of the API and other services.
+
+Configuring high availability for instances
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As of September 2016, the OpenStack High Availability community is
+designing and developing an official and unified way to provide high
+availability for instances. We are developing automatic
+recovery from failures of hardware or hypervisor-related software on
+the compute node, or other failures that could prevent instances from
+functioning correctly, such as issues with a cinder volume I/O path.
+
+More details are available in the `user story
+`_
+co-authored by OpenStack's HA community and `Product Working Group
+`_ (PWG), where this feature is
+identified as missing functionality in OpenStack, which
+should be addressed with high priority.
+
+Existing solutions
+~~~~~~~~~~~~~~~~~~
+
+The architectural challenges of instance HA and several currently
+existing solutions were presented in `a talk at the Austin summit
+`_,
+for which `slides are also available `_.
+
+The code for three of these solutions can be found online at the following
+links:
+
+* `a mistral-based auto-recovery workflow
+ `_, by Intel
+* `masakari `_, by NTT
+* `OCF RAs
+ `_,
+ as used by Red Hat and SUSE
+
+Current upstream work
+~~~~~~~~~~~~~~~~~~~~~
+
+Work is in progress on a unified approach, which combines the best
+aspects of existing upstream solutions. More details are available on
+`the HA VMs user story wiki
+`_.
+
+To get involved with this work, see the section on the
+:doc:`ha-community`.
diff --git a/doc/ha-guide-draft/source/conf.py b/doc/ha-guide-draft/source/conf.py
new file mode 100644
index 0000000000..6500ece076
--- /dev/null
+++ b/doc/ha-guide-draft/source/conf.py
@@ -0,0 +1,301 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import os
+# import sys
+
+import openstackdocstheme
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+# sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = []
+
+# Add any paths that contain templates here, relative to this directory.
+# templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'High Availability Guide Draft'
+bug_tag = u'ha-guide-draft'
+copyright = u'2017, OpenStack contributors'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.0.1'
+# The full version, including alpha/beta/rc tags.
+release = '0.0.1'
+
+# A few variables have to be set for the log-a-bug feature.
+# giturl: The location of conf.py on Git. Must be set manually.
+# gitsha: The SHA checksum of the latest commit. Automatically extracted from git log.
+# bug_tag: Tag for categorizing the bug. Must be set manually.
+# These variables are passed to the logabug code via html_context.
+giturl = u'https://git.openstack.org/cgit/openstack/openstack-manuals/tree/doc/ha-guide-draft/source'
+git_cmd = "/usr/bin/git log | head -n1 | cut -f2 -d' '"
+gitsha = os.popen(git_cmd).read().strip('\n')
+html_context = {"gitsha": gitsha, "bug_tag": bug_tag,
+ "giturl": giturl}
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+# language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+# today = ''
+# Else, today_fmt is used as the format for a strftime call.
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['common/cli*', 'common/nova*',
+ 'common/get-started*', 'common/dashboard*']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'openstackdocs'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = [openstackdocstheme.get_html_theme_path()]
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# " v documentation".
+# html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+# html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = []
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+# html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+# So that we can enable "log-a-bug" links from each output HTML page, this
+# variable must be set to a format that includes year, month, day, hours and
+# minutes.
+html_last_updated_fmt = '%Y-%m-%d %H:%M'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+# html_domain_indices = True
+
+# If false, no index is generated.
+html_use_index = False
+
+# If true, the index is split into individual pages for each letter.
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+html_show_sourcelink = False
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'ha-guide-draft'
+
+# If true, publish source files
+html_copy_source = False
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_engine = 'xelatex'
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ # 'papersize': 'letterpaper',
+
+ # set font (TODO: different fonts for translated PDF document builds)
+ 'fontenc': '\\usepackage{fontspec}',
+ 'fontpkg': '''\
+\defaultfontfeatures{Scale=MatchLowercase}
+\setmainfont{Liberation Serif}
+\setsansfont{Liberation Sans}
+\setmonofont[SmallCapsFont={Liberation Mono}]{Liberation Mono}
+''',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ # 'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ ('index', 'HAGuideDraft.tex', u'High Availability Guide Draft',
+ u'OpenStack contributors', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+# latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+# latex_appendices = []
+
+# If false, no module index is generated.
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'haguidedraft', u'High Availability Guide Draft',
+ [u'OpenStack contributors'], 1)
+]
+
+# If true, show URL addresses after external links.
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ ('index', 'HAGuideDraft', u'High Availability Guide Draft',
+ u'OpenStack contributors', 'HAGuideDraft',
+     'This guide shows OpenStack operators and deployers how to configure '
+ 'OpenStack Networking to be robust and fault-tolerant.', 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+# texinfo_no_detailmenu = False
+
+# -- Options for Internationalization output ------------------------------
+locale_dirs = ['locale/']
diff --git a/doc/ha-guide-draft/source/control-plane-stateful.rst b/doc/ha-guide-draft/source/control-plane-stateful.rst
new file mode 100644
index 0000000000..26bfdcea40
--- /dev/null
+++ b/doc/ha-guide-draft/source/control-plane-stateful.rst
@@ -0,0 +1,342 @@
+=================================
+Configuring the stateful services
+=================================
+.. to do: scope how in depth we want these sections to be
+
+Database for high availability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Galera
+------
+
+The first step is to install the database that sits at the heart of the
+cluster. To implement high availability, run an instance of the database on
+each controller node and use Galera Cluster to provide replication between
+them. Galera Cluster is a synchronous multi-master database cluster, based
+on MySQL and the InnoDB storage engine. It is a high-availability service
+that provides high system uptime, no data loss, and scalability for growth.
+
+You can achieve high availability for the OpenStack database in many
+different ways, depending on the type of database that you want to use.
+There are three implementations of Galera Cluster available to you:
+
+- `Galera Cluster for MySQL `_: The MySQL
+ reference implementation from Codership, Oy.
+- `MariaDB Galera Cluster `_: The MariaDB
+ implementation of Galera Cluster, which is commonly supported in
+ environments based on Red Hat distributions.
+- `Percona XtraDB Cluster `_: The XtraDB
+ implementation of Galera Cluster from Percona.
+
+In addition to Galera Cluster, you can also achieve high availability
+through other database options, such as PostgreSQL, which has its own
+replication system.
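+
+If you deploy one of the Galera Cluster implementations listed above, the
+Galera-related settings on each controller node look broadly similar across
+implementations. The following is a minimal sketch only; the configuration
+file location, the ``wsrep_provider`` path, and all cluster names, node
+names, and IP addresses are illustrative assumptions rather than values
+prescribed by this guide:
+
+.. code-block:: ini
+
+   [mysqld]
+   # Galera requires row-based replication and the InnoDB storage engine
+   binlog_format = ROW
+   default_storage_engine = InnoDB
+   innodb_autoinc_lock_mode = 2
+
+   # Galera provider library and cluster membership
+   wsrep_on = ON
+   wsrep_provider = /usr/lib/galera/libgalera_smm.so
+   wsrep_cluster_name = "openstack-db-cluster"
+   wsrep_cluster_address = "gcomm://10.0.0.12,10.0.0.13,10.0.0.14"
+
+   # Identity of this node
+   wsrep_node_name = "controller1"
+   wsrep_node_address = "10.0.0.12"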
+
+Pacemaker active/passive with HAProxy
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Replicated storage
+------------------
+
+For example: DRBD
+
+Shared storage
+--------------
+
+Messaging service for high availability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+RabbitMQ
+--------
+
+An AMQP (Advanced Message Queuing Protocol) compliant message bus is
+required for most OpenStack components in order to coordinate the
+execution of jobs entered into the system.
+
+The most popular AMQP implementation used in OpenStack installations
+is RabbitMQ.
+
+RabbitMQ nodes fail over on both the application and the infrastructure layers.
+
+The application layer is controlled by the ``oslo.messaging``
+configuration options for multiple AMQP hosts. If the AMQP node fails,
+the application reconnects to the next one configured within the
+specified reconnect interval. The specified reconnect interval
+constitutes its SLA.
+
+On the infrastructure layer, the SLA is the time the RabbitMQ cluster
+takes to reassemble. Several cases are possible. The Mnesia keeper
+node is the master of the corresponding Pacemaker resource for
+RabbitMQ. When it fails, the result is a full AMQP cluster downtime
+interval. Normally, its SLA is no more than several minutes. Failure
+of another node that is a slave of the corresponding Pacemaker
+resource for RabbitMQ results in no AMQP cluster downtime at all.
+
+.. until we've determined the content depth, I've transferred RabbitMQ
+ configuration below from the old HA guide (darrenc)
+
+Making the RabbitMQ service highly available involves the following steps:
+
+- :ref:`Install RabbitMQ <rabbitmq-install>`
+
+- :ref:`Configure RabbitMQ for HA queues <rabbitmq-configure>`
+
+- :ref:`Configure OpenStack services to use RabbitMQ HA queues
+  <rabbitmq-services>`
+
+.. note::
+
+ Access to RabbitMQ is not normally handled by HAProxy. Instead,
+ consumers must be supplied with the full list of hosts running
+ RabbitMQ with ``rabbit_hosts`` and turn on the ``rabbit_ha_queues``
+ option. For more information, read the `core issue
+ `_.
+ For more detail, read the `history and solution
+ `_.
+
+.. _rabbitmq-install:
+
+Install RabbitMQ
+^^^^^^^^^^^^^^^^
+
+The commands for installing RabbitMQ are specific to the Linux distribution
+you are using.
+
+For Ubuntu or Debian:
+
+.. code-block:: console
+
+ # apt-get install rabbitmq-server
+
+For RHEL, Fedora, or CentOS:
+
+.. code-block:: console
+
+ # yum install rabbitmq-server
+
+For openSUSE:
+
+.. code-block:: console
+
+ # zypper install rabbitmq-server
+
+For SLES 12:
+
+.. code-block:: console
+
+ # zypper addrepo -f obs://Cloud:OpenStack:Kilo/SLE_12 Kilo
+ [Verify the fingerprint of the imported GPG key. See below.]
+ # zypper install rabbitmq-server
+
+.. note::
+
+ For SLES 12, the packages are signed by GPG key 893A90DAD85F9316.
+ You should verify the fingerprint of the imported GPG key before using it.
+
+ .. code-block:: none
+
+ Key ID: 893A90DAD85F9316
+ Key Name: Cloud:OpenStack OBS Project
+ Key Fingerprint: 35B34E18ABC1076D66D5A86B893A90DAD85F9316
+ Key Created: Tue Oct 8 13:34:21 2013
+ Key Expires: Thu Dec 17 13:34:21 2015
+
+For more information, see the official installation manual for the
+distribution:
+
+- `Debian and Ubuntu `_
+- `RPM based `_
+ (RHEL, Fedora, CentOS, openSUSE)
+
+.. _rabbitmq-configure:
+
+Configure RabbitMQ for HA queues
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. [TODO: This section should begin with a brief mention
+.. about what HA queues are and why they are valuable, etc]
+
+.. [TODO: replace "currently" with specific release names]
+
+.. [TODO: Does this list need to be updated? Perhaps we need a table
+.. that shows each component and the earliest release that allows it
+.. to work with HA queues.]
+
+The following components/services can work with HA queues:
+
+- OpenStack Compute
+- OpenStack Block Storage
+- OpenStack Networking
+- Telemetry
+
+Consider that, while exchanges and bindings survive the loss of individual
+nodes, queues and their messages do not because a queue and its contents
+are located on one node. If we lose this node, we also lose the queue.
+
+Mirrored queues in RabbitMQ improve the availability of the service,
+since they make it resilient to failures.
+
+Production servers should run (at least) three RabbitMQ servers; for testing
+and demonstration purposes, it is possible to run only two servers.
+In this section, we configure two nodes, called ``rabbit1`` and ``rabbit2``.
+To build a broker, ensure that all nodes have the same Erlang cookie file.
+
+.. [TODO: Should the example instead use a minimum of three nodes?]
+
+#. Stop RabbitMQ and copy the cookie from the first node to each of the
+ other node(s):
+
+ .. code-block:: console
+
+ # scp /var/lib/rabbitmq/.erlang.cookie root@NODE:/var/lib/rabbitmq/.erlang.cookie
+
+#. On each target node, verify the correct owner,
+   group, and permissions of the file :file:`.erlang.cookie`:
+
+ .. code-block:: console
+
+ # chown rabbitmq:rabbitmq /var/lib/rabbitmq/.erlang.cookie
+ # chmod 400 /var/lib/rabbitmq/.erlang.cookie
+
+#. Start the message queue service on all nodes and configure it to start
+ when the system boots. On Ubuntu, it is configured by default.
+
+ On CentOS, RHEL, openSUSE, and SLES:
+
+ .. code-block:: console
+
+ # systemctl enable rabbitmq-server.service
+ # systemctl start rabbitmq-server.service
+
+#. Verify that the nodes are running:
+
+ .. code-block:: console
+
+ # rabbitmqctl cluster_status
+ Cluster status of node rabbit@NODE...
+ [{nodes,[{disc,[rabbit@NODE]}]},
+ {running_nodes,[rabbit@NODE]},
+ {partitions,[]}]
+ ...done.
+
+#. Run the following commands on each node except the first one:
+
+ .. code-block:: console
+
+ # rabbitmqctl stop_app
+ Stopping node rabbit@NODE...
+ ...done.
+ # rabbitmqctl join_cluster --ram rabbit@rabbit1
+ # rabbitmqctl start_app
+ Starting node rabbit@NODE ...
+ ...done.
+
+.. note::
+
+ The default node type is a disc node. In this guide, nodes
+ join the cluster as RAM nodes.
+
+#. Verify the cluster status:
+
+ .. code-block:: console
+
+ # rabbitmqctl cluster_status
+ Cluster status of node rabbit@NODE...
+ [{nodes,[{disc,[rabbit@rabbit1]},{ram,[rabbit@NODE]}]}, \
+ {running_nodes,[rabbit@NODE,rabbit@rabbit1]}]
+
+ If the cluster is working, you can create usernames and passwords
+ for the queues.
+
+#. To ensure that all queues except those with auto-generated names
+ are mirrored across all running nodes,
+ set the ``ha-mode`` policy key to all
+ by running the following command on one of the nodes:
+
+ .. code-block:: console
+
+ # rabbitmqctl set_policy ha-all '^(?!amq\.).*' '{"ha-mode": "all"}'
+
+More information is available in the RabbitMQ documentation:
+
+- `Highly Available Queues `_
+- `Clustering Guide `_
+
+.. note::
+
+ As another option to make RabbitMQ highly available, RabbitMQ contains the
+ OCF scripts for the Pacemaker cluster resource agents since version 3.5.7.
+ It provides the active/active RabbitMQ cluster with mirrored queues.
+ For more information, see `Auto-configuration of a cluster with
+ a Pacemaker `_.
+
+.. _rabbitmq-services:
+
+Configure OpenStack services to use Rabbit HA queues
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Configure the OpenStack components to use at least two RabbitMQ nodes.
+
+Use these steps to configure all services using RabbitMQ (a combined sketch
+of these options follows the steps):
+
+#. RabbitMQ HA cluster ``host:port`` pairs:
+
+ .. code-block:: console
+
+ rabbit_hosts=rabbit1:5672,rabbit2:5672,rabbit3:5672
+
+#. Retry connecting with RabbitMQ:
+
+ .. code-block:: console
+
+ rabbit_retry_interval=1
+
+#. How long to back-off for between retries when connecting to RabbitMQ:
+
+ .. code-block:: console
+
+ rabbit_retry_backoff=2
+
+#. Maximum retries with trying to connect to RabbitMQ (infinite by default):
+
+ .. code-block:: console
+
+ rabbit_max_retries=0
+
+#. Use durable queues in RabbitMQ:
+
+ .. code-block:: console
+
+ rabbit_durable_queues=true
+
+#. Use HA queues in RabbitMQ (``x-ha-policy: all``):
+
+ .. code-block:: console
+
+ rabbit_ha_queues=true
+
+.. note::
+
+ If you change the configuration from an old set-up
+ that did not use HA queues, restart the service:
+
+ .. code-block:: console
+
+ # rabbitmqctl stop_app
+ # rabbitmqctl reset
+ # rabbitmqctl start_app
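+
+Taken together, these options might appear in a service configuration file as
+in the following sketch. Depending on the OpenStack release, the options live
+either in ``[DEFAULT]`` or in ``[oslo_messaging_rabbit]``, and the host names
+shown are placeholders:
+
+.. code-block:: ini
+
+   [oslo_messaging_rabbit]
+   rabbit_hosts = rabbit1:5672,rabbit2:5672,rabbit3:5672
+   rabbit_retry_interval = 1
+   rabbit_retry_backoff = 2
+   rabbit_max_retries = 0
+   rabbit_durable_queues = true
+   rabbit_ha_queues = true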
+
+
+
+
+
+Pacemaker active/passive
+------------------------
+
+
+
+Mirrored queues
+---------------
+
+Qpid
+----
diff --git a/doc/ha-guide-draft/source/control-plane-stateless.rst b/doc/ha-guide-draft/source/control-plane-stateless.rst
new file mode 100644
index 0000000000..2daa2a5be2
--- /dev/null
+++ b/doc/ha-guide-draft/source/control-plane-stateless.rst
@@ -0,0 +1,518 @@
+==============================
+Configuring stateless services
+==============================
+
+.. to do: scope what details we want on the following services
+
+API services
+~~~~~~~~~~~~
+
+Load-balancer
+~~~~~~~~~~~~~
+
+HAProxy
+-------
+
+HAProxy provides a fast and reliable HTTP reverse proxy and load balancer
+for TCP or HTTP applications. It is particularly suited for web sites
+crawling under very high loads while needing persistence or Layer 7
+processing. It realistically supports tens of thousands of connections
+with recent hardware.
+
+Each instance of HAProxy configures its front end to accept connections only
+to the virtual IP (VIP) address. The HAProxy back end (termination
+point) is a list of all the IP addresses of instances for load balancing.
+
+.. note::
+
+   To ensure that your HAProxy installation is not a single point of failure,
+   it is advisable to have multiple HAProxy instances running.
+
+ You can also ensure the availability by other means, using Keepalived
+ or Pacemaker.
+
+Alternatively, you can use a commercial load balancer, either hardware-based
+or software-based. We recommend a hardware load balancer as it generally
+has good performance.
+
+For detailed instructions about installing HAProxy on your nodes,
+see the HAProxy `official documentation `_.
+
+Configuring HAProxy
+^^^^^^^^^^^^^^^^^^^
+
+#. Restart the HAProxy service.
+
+#. Locate your HAProxy instance on each OpenStack controller in your
+ environment. The following is an example ``/etc/haproxy/haproxy.cfg``
+ configuration file. Configure your instance using the following
+   configuration file; you will need a copy of it on each
+ controller node.
+
+
+ .. code-block:: none
+
+ global
+ chroot /var/lib/haproxy
+ daemon
+ group haproxy
+ maxconn 4000
+ pidfile /var/run/haproxy.pid
+ user haproxy
+
+ defaults
+ log global
+ maxconn 4000
+ option redispatch
+ retries 3
+ timeout http-request 10s
+ timeout queue 1m
+ timeout connect 10s
+ timeout client 1m
+ timeout server 1m
+ timeout check 10s
+
+ listen dashboard_cluster
+ bind :443
+ balance source
+ option tcpka
+ option httpchk
+ option tcplog
+ server controller1 10.0.0.12:443 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:443 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:443 check inter 2000 rise 2 fall 5
+
+ listen galera_cluster
+ bind :3306
+ balance source
+ option mysql-check
+ server controller1 10.0.0.12:3306 check port 9200 inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:3306 backup check port 9200 inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:3306 backup check port 9200 inter 2000 rise 2 fall 5
+
+ listen glance_api_cluster
+ bind :9292
+ balance source
+ option tcpka
+ option httpchk
+ option tcplog
+ server controller1 10.0.0.12:9292 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:9292 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:9292 check inter 2000 rise 2 fall 5
+
+ listen glance_registry_cluster
+ bind :9191
+ balance source
+ option tcpka
+ option tcplog
+ server controller1 10.0.0.12:9191 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:9191 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:9191 check inter 2000 rise 2 fall 5
+
+ listen keystone_admin_cluster
+ bind :35357
+ balance source
+ option tcpka
+ option httpchk
+ option tcplog
+ server controller1 10.0.0.12:35357 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:35357 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:35357 check inter 2000 rise 2 fall 5
+
+ listen keystone_public_internal_cluster
+ bind :5000
+ balance source
+ option tcpka
+ option httpchk
+ option tcplog
+ server controller1 10.0.0.12:5000 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:5000 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:5000 check inter 2000 rise 2 fall 5
+
+ listen nova_ec2_api_cluster
+ bind :8773
+ balance source
+ option tcpka
+ option tcplog
+ server controller1 10.0.0.12:8773 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:8773 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:8773 check inter 2000 rise 2 fall 5
+
+ listen nova_compute_api_cluster
+ bind :8774
+ balance source
+ option tcpka
+ option httpchk
+ option tcplog
+ server controller1 10.0.0.12:8774 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:8774 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:8774 check inter 2000 rise 2 fall 5
+
+ listen nova_metadata_api_cluster
+ bind :8775
+ balance source
+ option tcpka
+ option tcplog
+ server controller1 10.0.0.12:8775 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:8775 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:8775 check inter 2000 rise 2 fall 5
+
+ listen cinder_api_cluster
+ bind :8776
+ balance source
+ option tcpka
+ option httpchk
+ option tcplog
+ server controller1 10.0.0.12:8776 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:8776 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:8776 check inter 2000 rise 2 fall 5
+
+ listen ceilometer_api_cluster
+ bind :8777
+ balance source
+ option tcpka
+ option tcplog
+ server controller1 10.0.0.12:8777 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:8777 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:8777 check inter 2000 rise 2 fall 5
+
+ listen nova_vncproxy_cluster
+ bind :6080
+ balance source
+ option tcpka
+ option tcplog
+ server controller1 10.0.0.12:6080 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:6080 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:6080 check inter 2000 rise 2 fall 5
+
+ listen neutron_api_cluster
+ bind :9696
+ balance source
+ option tcpka
+ option httpchk
+ option tcplog
+ server controller1 10.0.0.12:9696 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:9696 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:9696 check inter 2000 rise 2 fall 5
+
+ listen swift_proxy_cluster
+ bind :8080
+ balance source
+ option tcplog
+ option tcpka
+ server controller1 10.0.0.12:8080 check inter 2000 rise 2 fall 5
+ server controller2 10.0.0.13:8080 check inter 2000 rise 2 fall 5
+ server controller3 10.0.0.14:8080 check inter 2000 rise 2 fall 5
+
+ .. note::
+
+ The Galera cluster configuration directive ``backup`` indicates
+ that two of the three controllers are standby nodes.
+ This ensures that only one node services write requests
+ because OpenStack support for multi-node writes is not yet production-ready.
+
+ .. note::
+
+ The Telemetry API service configuration does not have the ``option httpchk``
+ directive as it cannot process this check properly.
+
+.. TODO: explain why the Telemetry API is so special
+
+#. Configure the kernel parameter to allow non-local IP binding. This allows
+   running HAProxy instances to bind to a VIP for failover. Add the following
+   line to ``/etc/sysctl.conf``:
+
+ .. code-block:: none
+
+ net.ipv4.ip_nonlocal_bind = 1
+
+#. Restart the host or, to make changes work immediately, invoke:
+
+ .. code-block:: console
+
+ $ sysctl -p
+
+#. Add HAProxy to the cluster and ensure the VIPs can only run on machines
+ where HAProxy is active:
+
+ ``pcs``
+
+ .. code-block:: console
+
+ $ pcs resource create lb-haproxy systemd:haproxy --clone
+ $ pcs constraint order start vip then lb-haproxy-clone kind=Optional
+ $ pcs constraint colocation add lb-haproxy-clone with vip
+
+ ``crmsh``
+
+ .. code-block:: console
+
+ $ crm cib new conf-haproxy
+ $ crm configure primitive haproxy lsb:haproxy op monitor interval="1s"
+ $ crm configure clone haproxy-clone haproxy
+ $ crm configure colocation vip-with-haproxy inf: vip haproxy-clone
+ $ crm configure order haproxy-after-vip mandatory: vip haproxy-clone
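+
+   The commands above assume that a ``vip`` resource already exists in the
+   cluster. As a sketch only, such a resource could be created with the
+   ``ocf:heartbeat:IPaddr2`` agent (the IP address and netmask below are
+   placeholders):
+
+   .. code-block:: console
+
+      $ pcs resource create vip ocf:heartbeat:IPaddr2 ip=10.0.0.11 cidr_netmask=24 op monitor interval=30s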
+
+
+Pacemaker versus systemd
+------------------------
+
+Memcached
+---------
+
+Memcached is a general-purpose distributed memory caching system. It
+is used to speed up dynamic database-driven websites by caching data
+and objects in RAM to reduce the number of times an external data
+source must be read.
+
+Memcached is a memory cache daemon that can be used by most OpenStack
+services to store ephemeral data, such as tokens.
+
+Access to Memcached is not handled by HAProxy because replicated
+access is currently in an experimental state. Instead, OpenStack
+services must be supplied with the full list of hosts running
+Memcached.
+
+The Memcached client implements hashing to balance objects among the
+instances. Failure of an instance impacts only a percentage of the
+objects and the client automatically removes it from the list of
+instances. The SLA is several minutes.
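+
+As a sketch, a service that uses ``oslo.cache`` might be pointed at all
+Memcached instances like this (the section and option names depend on the
+service and release, and the host names are placeholders):
+
+.. code-block:: ini
+
+   [cache]
+   enabled = true
+   backend = oslo_cache.memcache_pool
+   memcache_servers = controller1:11211,controller2:11211,controller3:11211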
+
+
+Highly available API services
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Identity API
+------------
+
+Ensure you have read the
+`OpenStack Identity service getting started documentation
+`_.
+
+.. to do: reference controller-ha-identity and see if section involving
+ adding to pacemaker is in scope
+
+
+Add OpenStack Identity resource to Pacemaker
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following section(s) detail how to add the Identity service
+to Pacemaker on SUSE and Red Hat.
+
+SUSE
+----
+
+SUSE Enterprise Linux and SUSE-based distributions, such as openSUSE,
+use a set of OCF agents for controlling OpenStack services.
+
+#. Run the following commands to download the OpenStack Identity resource
+ to Pacemaker:
+
+ .. code-block:: console
+
+ # cd /usr/lib/ocf/resource.d
+ # mkdir openstack
+ # cd openstack
+ # wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/keystone
+ # chmod a+rx *
+
+#. Add the Pacemaker configuration for the OpenStack Identity resource
+ by running the following command to connect to the Pacemaker cluster:
+
+ .. code-block:: console
+
+ # crm configure
+
+#. Add the following cluster resources:
+
+ .. code-block:: console
+
+ clone p_keystone ocf:openstack:keystone \
+ params config="/etc/keystone/keystone.conf" os_password="secretsecret" os_username="admin" os_tenant_name="admin" os_auth_url="http://10.0.0.11:5000/v2.0/" \
+ op monitor interval="30s" timeout="30s"
+
+ .. note::
+
+ This configuration creates ``p_keystone``,
+ a resource for managing the OpenStack Identity service.
+
+#. Commit your configuration changes from the :command:`crm configure` menu
+ with the following command:
+
+ .. code-block:: console
+
+ # commit
+
+   The :command:`crm configure` command supports batch input. You may have to
+   copy and paste the above lines into your live Pacemaker configuration, and
+   then make changes as required.
+
+ For example, you may enter ``edit p_ip_keystone`` from the
+ :command:`crm configure` menu and edit the resource to match your preferred
+ virtual IP address.
+
+ Pacemaker now starts the OpenStack Identity service and its dependent
+ resources on all of your nodes.
+
+Red Hat
+--------
+
+For Red Hat Enterprise Linux and Red Hat-based Linux distributions,
+the following process uses Systemd unit files.
+
+.. code-block:: console
+
+ # pcs resource create openstack-keystone systemd:openstack-keystone --clone interleave=true
+
+.. _identity-config-identity:
+
+Configure OpenStack Identity service
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+#. Edit the :file:`keystone.conf` file
+ to change the values of the :manpage:`bind(2)` parameters:
+
+ .. code-block:: ini
+
+ bind_host = 10.0.0.12
+ public_bind_host = 10.0.0.12
+ admin_bind_host = 10.0.0.12
+
+ The ``admin_bind_host`` parameter
+ lets you use a private network for admin access.
+
+#. To be sure that all data is highly available,
+ ensure that everything is stored in the MySQL database
+ (which is also highly available):
+
+ .. code-block:: ini
+
+ [catalog]
+ driver = keystone.catalog.backends.sql.Catalog
+ # ...
+ [identity]
+ driver = keystone.identity.backends.sql.Identity
+ # ...
+
+#. If the Identity service will be sending ceilometer notifications
+ and your message bus is configured for high availability, you will
+ need to ensure that the Identity service is correctly configured to
+ use it.
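+
+   For example, as an illustrative sketch only, :file:`keystone.conf` could
+   list every RabbitMQ cluster member in its transport URL (the user name,
+   password, and host names are placeholders):
+
+   .. code-block:: ini
+
+      [DEFAULT]
+      transport_url = rabbit://openstack:RABBIT_PASS@rabbit1:5672,openstack:RABBIT_PASS@rabbit2:5672,openstack:RABBIT_PASS@rabbit3:5672/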
+
+.. _identity-services-config:
+
+Configure OpenStack services to use the highly available OpenStack Identity
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Your OpenStack services must now point their OpenStack Identity configuration
+to the highly available virtual cluster IP address.
+
+#. For the OpenStack Compute service, if your OpenStack Identity service
+   IP address is 10.0.0.11, use the following configuration in the
+ :file:`api-paste.ini` file:
+
+ .. code-block:: ini
+
+ auth_host = 10.0.0.11
+
+#. Create the OpenStack Identity Endpoint with this IP address.
+
+ .. note::
+
+ If you are using both private and public IP addresses,
+ create two virtual IP addresses and define the endpoint. For
+ example:
+
+ .. code-block:: console
+
+ $ openstack endpoint create --region $KEYSTONE_REGION \
+ $service-type public http://PUBLIC_VIP:5000/v2.0
+ $ openstack endpoint create --region $KEYSTONE_REGION \
+ $service-type admin http://10.0.0.11:35357/v2.0
+ $ openstack endpoint create --region $KEYSTONE_REGION \
+ $service-type internal http://10.0.0.11:5000/v2.0
+
+#. If you are using Dashboard (horizon), edit the :file:`local_settings.py`
+ file to include the following:
+
+ .. code-block:: ini
+
+ OPENSTACK_HOST = 10.0.0.11
+
+
+Telemetry API
+-------------
+
+The Telemetry polling agent can be configured to partition its polling
+workload between multiple agents. This enables high availability (HA).
+
+Both the central and the compute agent can run in an HA deployment.
+This means that multiple instances of these services can run in
+parallel with workload partitioning among these running instances.
+
+The `Tooz `_ library provides
+the coordination within the groups of service instances.
+It provides an API above several back ends that can be used for building
+distributed applications.
+
+Tooz supports
+`various drivers `_
+including the following back end solutions:
+
+* `Zookeeper `_:
+ Recommended solution by the Tooz project.
+
+* `Redis `_:
+ Recommended solution by the Tooz project.
+
+* `Memcached `_:
+ Recommended for testing.
+
+You must configure a supported Tooz driver for the HA deployment of
+the Telemetry services.
+
+For information about the required configuration options
+to set in the :file:`ceilometer.conf`, see the `coordination section
+`_
+in the OpenStack Configuration Reference.
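+
+As a minimal sketch, assuming a Redis back end, the coordination options in
+:file:`ceilometer.conf` could look like this (the host name is a placeholder):
+
+.. code-block:: ini
+
+   [coordination]
+   backend_url = redis://controller1:6379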
+
+.. note::
+
+   If the ``backend_url`` option is not set, only one instance of the central
+   and compute agent service(s) is able to run and function correctly.
+
+The availability check of the instances is provided by heartbeat messages.
+When the connection with an instance is lost, the workload will be
+reassigned within the remaining instances in the next polling cycle.
+
+.. note::
+
+ Memcached uses a timeout value, which should always be set to
+ a value that is higher than the heartbeat value set for Telemetry.
+
+For backward compatibility and to support existing deployments, the central
+agent configuration supports using different configuration files. This is for
+groups of service instances that are running in parallel.
+To enable this configuration, set a value for the
+``partitioning_group_prefix`` option in the
+`polling section `_
+in the OpenStack Configuration Reference.
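+
+As a sketch, the prefix could be set as follows (the value is a placeholder):
+
+.. code-block:: ini
+
+   [polling]
+   partitioning_group_prefix = my-prefix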
+
+.. warning::
+
+ For each sub-group of the central agent pool with the same
+ ``partitioning_group_prefix``, a disjoint subset of meters must be polled
+   to avoid samples being missed or duplicated. The list of meters to poll
+ can be set in the :file:`/etc/ceilometer/pipeline.yaml` configuration file.
+ For more information about pipelines see the `Data processing and pipelines
+ `_
+ section.
+
+To enable the compute agent to run multiple instances simultaneously with
+workload partitioning, the ``workload_partitioning`` option must be set to
+``True`` under the `compute section `_
+in the :file:`ceilometer.conf` configuration file.
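+
+A corresponding sketch for the compute agent:
+
+.. code-block:: ini
+
+   [compute]
+   workload_partitioning = True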
+
+
+.. To Do: Cover any other projects here with API services which require specific
+ HA details.
diff --git a/doc/ha-guide-draft/source/control-plane.rst b/doc/ha-guide-draft/source/control-plane.rst
new file mode 100644
index 0000000000..36ede0826b
--- /dev/null
+++ b/doc/ha-guide-draft/source/control-plane.rst
@@ -0,0 +1,9 @@
+===========================
+Configuring a control plane
+===========================
+
+.. toctree::
+ :maxdepth: 2
+
+ control-plane-stateless.rst
+ control-plane-stateful.rst
diff --git a/doc/ha-guide-draft/source/figures/Cluster-deployment-collapsed.png b/doc/ha-guide-draft/source/figures/Cluster-deployment-collapsed.png
new file mode 100644
index 0000000000..91feec0bb1
Binary files /dev/null and b/doc/ha-guide-draft/source/figures/Cluster-deployment-collapsed.png differ
diff --git a/doc/ha-guide-draft/source/figures/Cluster-deployment-segregated.png b/doc/ha-guide-draft/source/figures/Cluster-deployment-segregated.png
new file mode 100644
index 0000000000..a504ae18aa
Binary files /dev/null and b/doc/ha-guide-draft/source/figures/Cluster-deployment-segregated.png differ
diff --git a/doc/ha-guide-draft/source/ha-community.rst b/doc/ha-guide-draft/source/ha-community.rst
new file mode 100644
index 0000000000..cba0598b12
--- /dev/null
+++ b/doc/ha-guide-draft/source/ha-community.rst
@@ -0,0 +1,17 @@
+============
+HA community
+============
+
+The OpenStack HA community holds `weekly IRC meetings
+`_ to discuss
+a range of topics relating to HA in OpenStack. Everyone interested is
+encouraged to attend. The `logs of all previous meetings
+`_ are available to read.
+
+You can contact the HA community directly in `the #openstack-ha
+channel on Freenode IRC `_, or by
+sending mail to the `openstack-dev
+`_
+or `openstack-docs
+`_
+mailing list with the ``[HA]`` prefix in the ``Subject`` header.
diff --git a/doc/ha-guide-draft/source/index.rst b/doc/ha-guide-draft/source/index.rst
new file mode 100644
index 0000000000..87bb04a894
--- /dev/null
+++ b/doc/ha-guide-draft/source/index.rst
@@ -0,0 +1,38 @@
+=================================
+OpenStack High Availability Guide
+=================================
+
+Abstract
+~~~~~~~~
+
+This guide describes how to install and configure OpenStack for high
+availability. It supplements the Installation Tutorials and Guides
+and assumes that you are familiar with the material in those guides.
+
+This guide documents OpenStack Ocata, Newton, and Mitaka releases.
+
+.. warning::
+
+ This guide is a work-in-progress and changing rapidly
+ while we continue to test and enhance the guidance. There are
+   open `TODO` items throughout, which are also tracked on the OpenStack
+   manuals `bug list `_.
+ Please help where you are able.
+
+.. toctree::
+ :maxdepth: 1
+
+ common/conventions.rst
+ overview.rst
+ intro-ha.rst
+ intro-os-ha.rst
+ control-plane.rst
+ networking-ha.rst
+ storage-ha.rst
+ compute-node-ha.rst
+ monitoring.rst
+ testing.rst
+ ref-arch-examples.rst
+ ha-community.rst
+ common/app-support.rst
+ common/glossary.rst
diff --git a/doc/ha-guide-draft/source/intro-ha-common-tech.rst b/doc/ha-guide-draft/source/intro-ha-common-tech.rst
new file mode 100644
index 0000000000..572804f60a
--- /dev/null
+++ b/doc/ha-guide-draft/source/intro-ha-common-tech.rst
@@ -0,0 +1,98 @@
+========================
+Commonly used technology
+========================
+
+Hardware
+~~~~~~~~
+The following are the standard hardware requirements:
+
+- Provider networks: See the *Overview -> Networking Option 1: Provider
+ networks* section of the
+ `Install Tutorials and Guides `_
+ depending on your distribution.
+- Self-service networks: See the *Overview -> Networking Option 2:
+ Self-service networks* section of the
+ `Install Tutorials and Guides `_
+ depending on your distribution.
+
+Load balancers
+--------------
+
+Redundant switches
+------------------
+
+Bonded interfaces
+-----------------
+
+Storage
+-------
+
+Software
+~~~~~~~~
+
+HAProxy
+-------
+
+HAProxy provides a fast and reliable HTTP reverse proxy and load balancer
+for TCP or HTTP applications. It is particularly suited for web sites
+crawling under very high loads while needing persistence or Layer 7
+processing. It realistically supports tens of thousands of connections
+with recent hardware.
+
+.. note::
+
+   To ensure that your HAProxy installation is not a single point of failure,
+   it is advisable to have multiple HAProxy instances running.
+
+ You can also ensure the availability by other means, using Keepalived
+ or Pacemaker.
+
+Alternatively, you can use a commercial load balancer, either hardware-based
+or software-based. We recommend a hardware load balancer as it generally
+has good performance.
+
+For detailed instructions about installing HAProxy on your nodes,
+see the HAProxy `official documentation `_.
+
+keepalived
+----------
+
+`keepalived `_ is routing software that
+provides facilities for load balancing and high availability to Linux
+systems and Linux-based infrastructures.
+
+Keepalived implements a set of checkers to dynamically and
+adaptively maintain and manage a load-balanced server pool according
+to the health of its members.
+
+The keepalived daemon can be used to monitor services or systems and
+to automatically failover to a standby if problems occur.
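+
+As an illustrative sketch only (not part of this guide's reference
+configuration), a keepalived VRRP instance that fails a virtual IP address
+over between two load-balancer nodes might look like the following. The
+interface name, priorities, and the virtual IP address are placeholder
+assumptions:
+
+.. code-block:: none
+
+   vrrp_script chk_haproxy {
+       script "killall -0 haproxy"   # check that an haproxy process exists
+       interval 2
+   }
+
+   vrrp_instance VI_1 {
+       state MASTER                  # use BACKUP and a lower priority on the peer
+       interface eth0
+       virtual_router_id 51
+       priority 101
+       virtual_ipaddress {
+           10.0.0.11
+       }
+       track_script {
+           chk_haproxy
+       }
+   }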
+
+Pacemaker
+---------
+
+`Pacemaker `_ cluster stack is a state-of-the-art
+high availability and load balancing stack for the Linux platform.
+Pacemaker is used to make OpenStack infrastructure highly available.
+
+Pacemaker relies on the
+`Corosync `_ messaging layer
+for reliable cluster communications. Corosync implements the Totem single-ring
+ordering and membership protocol. It also provides UDP and InfiniBand based
+messaging, quorum, and cluster membership to Pacemaker.
+
+Pacemaker does not inherently understand the applications it manages.
+Instead, it relies on resource agents (RAs), scripts that encapsulate
+the knowledge of how to start, stop, and check the health of each application
+managed by the cluster.
+
+These agents must conform to one of the `OCF `_,
+`SysV Init `_, Upstart, or Systemd standards.
+
+Pacemaker ships with a large set of OCF agents (such as those managing
+MySQL databases, virtual IP addresses, and RabbitMQ), but can also use
+any agents already installed on your system and can be extended with
+your own (see the
+`developer guide `_).
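+
+For example, as a sketch, you can list the resource agents already installed
+on a node with either of the following commands, depending on whether you use
+the ``crmsh`` or ``pcs`` command-line tools:
+
+.. code-block:: console
+
+   # crm ra list ocf
+   # pcs resource agents ocf:heartbeat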
diff --git a/doc/ha-guide-draft/source/intro-ha-key-concepts.rst b/doc/ha-guide-draft/source/intro-ha-key-concepts.rst
new file mode 100644
index 0000000000..4a75d53b2c
--- /dev/null
+++ b/doc/ha-guide-draft/source/intro-ha-key-concepts.rst
@@ -0,0 +1,147 @@
+============
+Key concepts
+============
+
+Redundancy and failover
+~~~~~~~~~~~~~~~~~~~~~~~
+
+High availability is implemented with redundant hardware
+running redundant instances of each service.
+If one piece of hardware running one instance of a service fails,
+the system can then failover to use another instance of a service
+that is running on hardware that did not fail.
+
+A crucial aspect of high availability
+is the elimination of single points of failure (SPOFs).
+A SPOF is an individual piece of equipment or software
+that causes system downtime or data loss if it fails.
+In order to eliminate SPOFs, check that mechanisms exist for redundancy of:
+
+- Network components, such as switches and routers
+
+- Applications and automatic service migration
+
+- Storage components
+
+- Facility services such as power, air conditioning, and fire protection
+
+In the event that a component fails and a back-up system must take on
+its load, most high availability systems will replace the failed
+component as quickly as possible to maintain necessary redundancy. This
+way time spent in a degraded protection state is minimized.
+
+Most high availability systems fail in the event of multiple
+independent (non-consequential) failures. In this case, most
+implementations favor protecting data over maintaining availability.
+
+High availability systems typically achieve an uptime percentage of
+99.99% or more, which roughly equates to less than an hour of
+cumulative downtime per year. In order to achieve this, high
+availability systems should keep recovery times after a failure to
+about one to two minutes, sometimes significantly less.
+
+OpenStack currently meets such availability requirements for its own
+infrastructure services, meaning that an uptime of 99.99% is feasible
+for the OpenStack infrastructure proper. However, OpenStack does not
+guarantee 99.99% availability for individual guest instances.
+
+This document discusses some common methods of implementing highly
+available systems, with an emphasis on the core OpenStack services and
+other open source services that are closely aligned with OpenStack.
+
+You will need to address high availability concerns for any applications
+software that you run on your OpenStack environment. The important thing is
+to make sure that your services are redundant and available.
+How you achieve that is up to you.
+
+Active/passive versus active/active
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Stateful services can be configured as active/passive or active/active,
+which are defined as follows:
+
+:term:`active/passive configuration`
+ Maintains a redundant instance
+ that can be brought online when the active service fails.
+ For example, OpenStack writes to the main database
+ while maintaining a disaster recovery database that can be brought online
+ if the main database fails.
+
+ A typical active/passive installation for a stateful service maintains
+ a replacement resource that can be brought online when required.
+ Requests are handled using a :term:`virtual IP address (VIP)` that
+ facilitates returning to service with minimal reconfiguration.
+ A separate application (such as Pacemaker or Corosync) monitors
+ these services, bringing the backup online as necessary.
+
+:term:`active/active configuration`
+ Each service also has a backup but manages both the main and
+ redundant systems concurrently.
+ This way, if there is a failure, the user is unlikely to notice.
+ The backup system is already online and takes on increased load
+ while the main system is fixed and brought back online.
+
+ Typically, an active/active installation for a stateless service
+ maintains a redundant instance, and requests are load balanced using
+ a virtual IP address and a load balancer such as HAProxy.
+
+ A typical active/active installation for a stateful service includes
+ redundant services, with all instances having an identical state. In
+ other words, updates to one instance of a database update all other
+ instances. This way a request to one instance is the same as a
+ request to any other. A load balancer manages the traffic to these
+ systems, ensuring that operational systems always handle the
+ request.
+
+Clusters and quorums
+~~~~~~~~~~~~~~~~~~~~
+
+The quorum specifies the minimal number of nodes
+that must be functional in a cluster of redundant nodes
+in order for the cluster to remain functional.
+When one node fails and failover transfers control to other nodes,
+the system must ensure that data and processes remain sane.
+To determine this, the contents of the remaining nodes are compared
+and, if there are discrepancies, a majority rules algorithm is implemented.
+
+For this reason, each cluster in a high availability environment should
+have an odd number of nodes and the quorum is defined as more than a half
+of the nodes.
+If multiple nodes fail so that the cluster size falls below the quorum
+value, the cluster itself fails.
+
+For example, in a seven-node cluster, the quorum should be set to
+``floor(7/2) + 1 == 4``. If quorum is four and four nodes fail simultaneously,
+the cluster itself would fail, whereas it would continue to function if
+no more than three nodes fail. If split into partitions of three and four nodes
+respectively, the quorum of four nodes would continue to operate the majority
+partition and stop or fence the minority one (depending on the
+no-quorum-policy cluster configuration).
+
+As a configuration example, the quorum could also have been set to three.
+
+.. note::
+
+ We do not recommend setting the quorum to a value less than ``floor(n/2) + 1``
+   as it would likely cause a split-brain in the face of network partitions.
+
+With a quorum of three, the cluster would also continue to function when four
+nodes fail simultaneously. However, if split into partitions of three and four
+nodes respectively, a quorum of three would make both sides attempt to fence
+the other and host the resources. Without fencing enabled, the cluster would
+go straight to running two copies of each resource.
+
+This is why setting the quorum to a value less than ``floor(n/2) + 1`` is
+dangerous. However, it may be required for some specific cases, such as a
+temporary measure at a point when it is known with 100% certainty that the
+other nodes are down.
+
+When configuring an OpenStack environment for study or demonstration purposes,
+it is possible to turn off the quorum checking. Production systems should
+always run with quorum enabled.
+
+Load balancing
+~~~~~~~~~~~~~~
+
+.. to do: definition and description of need within HA
diff --git a/doc/ha-guide-draft/source/intro-ha.rst b/doc/ha-guide-draft/source/intro-ha.rst
new file mode 100644
index 0000000000..d798c46c3c
--- /dev/null
+++ b/doc/ha-guide-draft/source/intro-ha.rst
@@ -0,0 +1,24 @@
+=================================
+Introduction to high availability
+=================================
+
+High availability systems seek to minimize the following issues:
+
+#. System downtime: Occurs when a user-facing service is unavailable
+ beyond a specified maximum amount of time.
+
+#. Data loss: Accidental deletion or destruction of data.
+
+Most high availability systems guarantee protection against system downtime
+and data loss only in the event of a single failure.
+However, they are also expected to protect against cascading failures,
+where a single failure deteriorates into a series of consequential failures.
+Many service providers guarantee a :term:`Service Level Agreement (SLA)`
+including uptime percentage of computing service, which is calculated based
+on the available time and system downtime excluding planned outage time.
+
+.. toctree::
+ :maxdepth: 2
+
+ intro-ha-key-concepts.rst
+ intro-ha-common-tech.rst
diff --git a/doc/ha-guide-draft/source/intro-os-ha-cluster.rst b/doc/ha-guide-draft/source/intro-os-ha-cluster.rst
new file mode 100644
index 0000000000..555ee2631d
--- /dev/null
+++ b/doc/ha-guide-draft/source/intro-os-ha-cluster.rst
@@ -0,0 +1,67 @@
+================
+Cluster managers
+================
+
+At its core, a cluster is a distributed finite state machine capable
+of co-ordinating the startup and recovery of inter-related services
+across a set of machines.
+
+Even a distributed or replicated application that is able to survive failures
+on one or more machines can benefit from a cluster manager because a cluster
+manager has the following capabilities:
+
+#. Awareness of other applications in the stack
+
+ While SysV init replacements like systemd can provide
+ deterministic recovery of a complex stack of services, the
+ recovery is limited to one machine and lacks the context of what
+ is happening on other machines. This context is crucial to
+ determine the difference between a local failure and a clean
+ startup and recovery after a total site failure.
+
+#. Awareness of instances on other machines
+
+ Services like RabbitMQ and Galera have complicated boot-up
+ sequences that require co-ordination, and often serialization, of
+ startup operations across all machines in the cluster. This is
+ especially true after a site-wide failure or shutdown where you must
+ first determine the last machine to be active.
+
+#. A shared implementation and calculation of `quorum
+ `_
+
+ It is very important that all members of the system share the same
+ view of who their peers are and whether or not they are in the
+ majority. Failure to do this leads very quickly to an internal
+ `split-brain `_
+ state. This is where different parts of the system are pulling in
+ different and incompatible directions.
+
+#. Data integrity through fencing (a non-responsive process does not
+ imply it is not doing anything)
+
+ A single application does not have sufficient context to know the
+ difference between failure of a machine and failure of the
+ application on a machine. The usual practice is to assume the
+ machine is dead and continue working; however, this is highly risky. A
+ rogue process or machine could still be responding to requests and
+ generally causing havoc. The safer approach is to make use of
+ remotely accessible power switches and/or network switches and SAN
+ controllers to fence (isolate) the machine before continuing (a
+ configuration sketch follows this list).
+
+#. Automated recovery of failed instances
+
+ While the application can still run after the failure of several
+ instances, it may not have sufficient capacity to serve the
+ required volume of requests. A cluster can automatically recover
+ failed instances to prevent additional load-induced failures.
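+
+The fencing sketch referenced above, using an IPMI-controlled power switch
+managed through ``pcs`` (the agent name, parameter names, and addresses are
+illustrative and depend on the fence agent shipped with your distribution):
+
+.. code-block:: console
+
+   # pcs stonith create fence_node1 fence_ipmilan \
+       pcmk_host_list="node1" ipaddr="10.0.0.101" \
+       login="admin" passwd="FENCE_PASS" lanplus=1 \
+       op monitor interval=60s
+   # pcs property set stonith-enabled=true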
+
+Pacemaker
+~~~~~~~~~
+.. to do: description and point to ref arch example using pacemaker
+
+`Pacemaker `_.
+
+Systemd
+~~~~~~~
+.. to do: description and point to ref arch example using Systemd and link
diff --git a/doc/ha-guide-draft/source/intro-os-ha-memcached.rst b/doc/ha-guide-draft/source/intro-os-ha-memcached.rst
new file mode 100644
index 0000000000..709c891199
--- /dev/null
+++ b/doc/ha-guide-draft/source/intro-os-ha-memcached.rst
@@ -0,0 +1,35 @@
+=========
+Memcached
+=========
+
+Most OpenStack services can use Memcached to store ephemeral data such as
+tokens. Although Memcached does not support typical forms of redundancy such
+as clustering, OpenStack services can use almost any number of instances
+by configuring multiple hostnames or IP addresses.
+
+The Memcached client implements hashing to balance objects among the instances.
+Failure of an instance only impacts a percentage of the objects,
+and the client automatically removes it from the list of instances.
+
+Installation
+~~~~~~~~~~~~
+
+To install and configure Memcached, read the
+`official documentation `_.
+
+Memory caching is managed by `oslo.cache
+`_.
+This ensures consistency across all projects when using multiple Memcached
+servers. The following is an example configuration with three hosts:
+
+.. code-block:: ini
+
+ memcached_servers = controller1:11211,controller2:11211,controller3:11211
+
+By default, ``controller1`` handles the caching service. If the host goes down,
+``controller2`` or ``controller3`` takes over the service.
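+
+When configured through the ``oslo.cache`` library itself, the equivalent
+settings normally live in a ``[cache]`` section. The following is a minimal
+sketch (the section and option names are an assumption based on recent
+``oslo.cache`` releases; check them against your service's configuration
+reference):
+
+.. code-block:: ini
+
+   [cache]
+   # Enable caching and use the pooled Memcached backend
+   enabled = true
+   backend = oslo_cache.memcache_pool
+   memcache_servers = controller1:11211,controller2:11211,controller3:11211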
+
+For more information about Memcached installation, see the
+*Environment -> Memcached* section in the
+`Installation Tutorials and Guides `_
+depending on your distribution.
diff --git a/doc/ha-guide-draft/source/intro-os-ha-state.rst b/doc/ha-guide-draft/source/intro-os-ha-state.rst
new file mode 100644
index 0000000000..ba7703e844
--- /dev/null
+++ b/doc/ha-guide-draft/source/intro-os-ha-state.rst
@@ -0,0 +1,52 @@
+==================================
+Stateless versus stateful services
+==================================
+
+OpenStack components can be divided into three categories:
+
+- OpenStack APIs: HTTP(S) stateless services written in Python, easy to
+ duplicate and mostly easy to load balance.
+
+- The SQL relational database server provides stateful storage consumed by
+ other components. Supported databases are MySQL, MariaDB, and PostgreSQL.
+ Making the SQL database redundant is complex.
+
+- :term:`Advanced Message Queuing Protocol (AMQP)` provides the OpenStack
+ internal stateful communication service.
+
+.. to do: Ensure the difference between stateless and stateful services
+.. is clear
+
+Stateless services
+~~~~~~~~~~~~~~~~~~
+
+A service that provides a response after your request and then
+requires no further attention. To make a stateless service highly
+available, you need to provide redundant instances and load balance them.
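+
+The following is a minimal HAProxy sketch of load balancing one stateless API
+across redundant instances (the service, addresses, and ports are illustrative
+only and are not taken from a reference architecture):
+
+.. code-block:: none
+
+   # Balance the Identity API across three controllers behind a virtual IP
+   listen keystone_api
+     bind 10.0.0.11:5000
+     balance roundrobin
+     option httpchk
+     server controller1 10.0.0.12:5000 check inter 2000 rise 2 fall 5
+     server controller2 10.0.0.13:5000 check inter 2000 rise 2 fall 5
+     server controller3 10.0.0.14:5000 check inter 2000 rise 2 fall 5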
+
+Stateless OpenStack services
+----------------------------
+
+OpenStack services that are stateless include ``nova-api``,
+``nova-conductor``, ``glance-api``, ``keystone-api``, ``neutron-api``,
+and ``nova-scheduler``.
+
+Stateful services
+~~~~~~~~~~~~~~~~~
+
+A service where subsequent requests to the service
+depend on the results of the first request.
+Stateful services are more difficult to manage because a single
+action typically involves more than one request. Providing
+additional instances and load balancing does not solve the problem.
+For example, if the horizon user interface reset itself every time
+you went to a new page, it would not be very useful.
+OpenStack services that are stateful include the OpenStack database
+and message queue.
+Making stateful services highly available can depend on whether you choose
+an active/passive or active/active configuration.
+
+Stateful OpenStack services
+----------------------------
+
+.. to do: create list of stateful services
diff --git a/doc/ha-guide-draft/source/intro-os-ha.rst b/doc/ha-guide-draft/source/intro-os-ha.rst
new file mode 100644
index 0000000000..5613122aed
--- /dev/null
+++ b/doc/ha-guide-draft/source/intro-os-ha.rst
@@ -0,0 +1,12 @@
+================================================
+Introduction to high availability with OpenStack
+================================================
+
+.. to do: description of section & improvement of title (intro to OS HA)
+
+.. toctree::
+ :maxdepth: 2
+
+ intro-os-ha-state.rst
+ intro-os-ha-cluster.rst
+ intro-os-ha-memcached.rst
diff --git a/doc/ha-guide-draft/source/monitoring.rst b/doc/ha-guide-draft/source/monitoring.rst
new file mode 100644
index 0000000000..a1b132774f
--- /dev/null
+++ b/doc/ha-guide-draft/source/monitoring.rst
@@ -0,0 +1,6 @@
+==========
+Monitoring
+==========
+
+
+
diff --git a/doc/ha-guide-draft/source/networking-ha-l3-agent.rst b/doc/ha-guide-draft/source/networking-ha-l3-agent.rst
new file mode 100644
index 0000000000..5a6370ae1c
--- /dev/null
+++ b/doc/ha-guide-draft/source/networking-ha-l3-agent.rst
@@ -0,0 +1,20 @@
+========
+L3 Agent
+========
+.. TODO: Introduce L3 agent
+
+HA Routers
+~~~~~~~~~~
+.. TODO: content for HA routers
+
+Networking DHCP agent
+~~~~~~~~~~~~~~~~~~~~~
+The OpenStack Networking (neutron) service has a scheduler that lets you run
+multiple agents across nodes. The DHCP agent can be natively highly available.
+
+To configure the number of DHCP agents per network, modify the
+``dhcp_agents_per_network`` parameter in the :file:`/etc/neutron/neutron.conf`
+file, as shown in the sketch below. By default, this is set to 1. To achieve
+high availability, assign more than one DHCP agent per network. For more
+information, see
+`High-availability for DHCP
+`_.
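+
+A minimal sketch of the relevant setting (the value of three assumes that at
+least three network nodes are available to host DHCP agents):
+
+.. code-block:: ini
+
+   [DEFAULT]
+   # Schedule each network onto three DHCP agents
+   dhcp_agents_per_network = 3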
diff --git a/doc/ha-guide-draft/source/networking-ha-neutron-l3-analysis.rst b/doc/ha-guide-draft/source/networking-ha-neutron-l3-analysis.rst
new file mode 100644
index 0000000000..7a803132f7
--- /dev/null
+++ b/doc/ha-guide-draft/source/networking-ha-neutron-l3-analysis.rst
@@ -0,0 +1,6 @@
+==========
+Neutron L3
+==========
+
+.. TODO: create and import Neutron L3 analysis
+ Introduce the Networking (neutron) service L3 agent
diff --git a/doc/ha-guide-draft/source/networking-ha-neutron-server.rst b/doc/ha-guide-draft/source/networking-ha-neutron-server.rst
new file mode 100644
index 0000000000..646eb66aa4
--- /dev/null
+++ b/doc/ha-guide-draft/source/networking-ha-neutron-server.rst
@@ -0,0 +1,5 @@
+=========================
+Neutron Networking server
+=========================
+
+.. TODO: Create content similar to other API sections
diff --git a/doc/ha-guide-draft/source/networking-ha.rst b/doc/ha-guide-draft/source/networking-ha.rst
new file mode 100644
index 0000000000..0767455056
--- /dev/null
+++ b/doc/ha-guide-draft/source/networking-ha.rst
@@ -0,0 +1,29 @@
+===================================
+Configuring the networking services
+===================================
+
+Configure networking on each node. See the basic information about
+configuring networking in the Networking service section of the
+`Installation Tutorials and Guides `_,
+depending on your distribution.
+
+OpenStack network nodes contain:
+
+- Networking DHCP agent
+- Neutron L3 agent
+- Networking L2 agent
+
+.. note::
+
+ The L2 agent cannot be distributed and highly available. Instead, it
+ must be installed on each data forwarding node to control the virtual
+ network driver such as Open vSwitch or Linux Bridge. One L2 agent runs
+ per node and controls its virtual interfaces.
+
+.. toctree::
+ :maxdepth: 2
+
+ networking-ha-neutron-server.rst
+ networking-ha-neutron-l3-analysis.rst
+ networking-ha-l3-agent.rst
+
diff --git a/doc/ha-guide-draft/source/overview.rst b/doc/ha-guide-draft/source/overview.rst
new file mode 100644
index 0000000000..7b64054e21
--- /dev/null
+++ b/doc/ha-guide-draft/source/overview.rst
@@ -0,0 +1,24 @@
+========
+Overview
+========
+
+This guide can be split into two parts:
+
+#. High level architecture
+#. Reference architecture examples, monitoring, and testing
+
+.. warning::
+ Use this guide for assistance when considering your HA cloud, not as a set
+ of step-by-step instructions for manually building one. We recommend
+ starting with a pre-validated solution and adjusting it to your needs.
+
+High availability is not for every user, and it presents some challenges.
+It may be too complex for databases or systems with large amounts of data,
+and replication can slow large systems down. Different setups have different
+prerequisites, so read the guidelines for each setup.
+
+.. important::
+
+ High availability is not enabled by default in OpenStack setups.
diff --git a/doc/ha-guide-draft/source/ref-arch-examples.rst b/doc/ha-guide-draft/source/ref-arch-examples.rst
new file mode 100644
index 0000000000..dc842f3f53
--- /dev/null
+++ b/doc/ha-guide-draft/source/ref-arch-examples.rst
@@ -0,0 +1,3 @@
+======================
+Reference Architecture
+======================
diff --git a/doc/ha-guide-draft/source/storage-ha-backend.rst b/doc/ha-guide-draft/source/storage-ha-backend.rst
new file mode 100644
index 0000000000..8148b5287a
--- /dev/null
+++ b/doc/ha-guide-draft/source/storage-ha-backend.rst
@@ -0,0 +1,59 @@
+
+.. _storage-ha-backend:
+
+================
+Storage back end
+================
+
+An OpenStack environment includes multiple data pools for the VMs:
+
+- Ephemeral storage is allocated for an instance and is deleted when the
+ instance is deleted. The Compute service manages ephemeral storage and,
+ by default, stores ephemeral drives as files on local disks on the
+ compute node. As an alternative, you can use Ceph RBD as the storage back
+ end for ephemeral storage.
+
+- Persistent storage exists outside all instances. Two types of persistent
+ storage are provided:
+
+ - The Block Storage service (cinder) that can use LVM or Ceph RBD as the
+ storage back end.
+ - The Image service (glance) that can use the Object Storage service (swift)
+ or Ceph RBD as the storage back end.
+
+For more information about configuring storage back ends for
+the different storage options, see `Manage volumes
+`_
+in the OpenStack Administrator Guide.
+
+This section discusses ways to protect against data loss in your OpenStack
+environment.
+
+RAID drives
+-----------
+
+Configuring RAID on the hard drives that implement storage protects your data
+against a hard drive failure. If the node itself fails, data may be lost.
+In particular, all volumes stored on an LVM node can be lost.
+
+Ceph
+----
+
+`Ceph RBD `_ is an innately highly available storage back
+end. It creates a storage cluster with multiple nodes that communicate with
+each other to replicate and redistribute data dynamically.
+A Ceph RBD storage cluster provides a single shared set of storage nodes that
+can handle all classes of persistent and ephemeral data (glance, cinder, and
+nova) that are required for OpenStack instances.
+
+Ceph RBD provides object replication capabilities by storing Block Storage
+volumes as Ceph RBD objects. Ceph RBD ensures that each replica of an object
+is stored on a different node. This means that your volumes are protected
+against hard drive and node failures, or even the failure of the data center
+itself.
+
+When Ceph RBD is used for ephemeral volumes as well as block and image storage,
+it supports `live migration
+`_
+of VMs with ephemeral drives. LVM only supports live migration of
+volume-backed VMs.
diff --git a/doc/ha-guide-draft/source/storage-ha-block.rst b/doc/ha-guide-draft/source/storage-ha-block.rst
new file mode 100644
index 0000000000..a9000cbfa5
--- /dev/null
+++ b/doc/ha-guide-draft/source/storage-ha-block.rst
@@ -0,0 +1,190 @@
+==================================
+Highly available Block Storage API
+==================================
+
+Cinder provides Block-Storage-as-a-Service suitable for performance-sensitive
+scenarios such as databases, expandable file systems, or providing a server
+with access to raw block-level storage.
+
+Persistent block storage can survive instance termination and can also
+be moved across instances like any external storage device. Cinder
+also provides a volume snapshot capability for backing up volumes, as shown
+in the example below.
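+
+For instance, a snapshot of an existing volume can be created from the command
+line (the volume and snapshot names here are illustrative only):
+
+.. code-block:: console
+
+   $ openstack volume snapshot create --volume DATA_VOLUME data-volume-snap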
+
+Making the Block Storage API service highly available in
+active/passive mode involves:
+
+- :ref:`ha-blockstorage-pacemaker`
+- :ref:`ha-blockstorage-configure`
+- :ref:`ha-blockstorage-services`
+
+In theory, you can run the Block Storage service as active/active.
+However, because of a number of outstanding concerns, we recommend running
+the volume component as active/passive only.
+
+You can read more about these concerns on the
+`Red Hat Bugzilla `_
+and there is a
+`pseudo roadmap `_
+for addressing them upstream.
+
+.. _ha-blockstorage-pacemaker:
+
+Add Block Storage API resource to Pacemaker
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+On RHEL-based systems, create resources for cinder's systemd agents and create
+constraints to enforce startup/shutdown ordering:
+
+.. code-block:: console
+
+ pcs resource create openstack-cinder-api systemd:openstack-cinder-api --clone interleave=true
+ pcs resource create openstack-cinder-scheduler systemd:openstack-cinder-scheduler --clone interleave=true
+ pcs resource create openstack-cinder-volume systemd:openstack-cinder-volume
+
+ pcs constraint order start openstack-cinder-api-clone then openstack-cinder-scheduler-clone
+ pcs constraint colocation add openstack-cinder-scheduler-clone with openstack-cinder-api-clone
+ pcs constraint order start openstack-cinder-scheduler-clone then openstack-cinder-volume
+ pcs constraint colocation add openstack-cinder-volume with openstack-cinder-scheduler-clone
+
+
+If the Block Storage service runs on the same nodes as the other services,
+then it is advisable to also include:
+
+.. code-block:: console
+
+ pcs constraint order start openstack-keystone-clone then openstack-cinder-api-clone
+
+Alternatively, instead of using systemd agents, download and
+install the OCF resource agent:
+
+.. code-block:: console
+
+ # cd /usr/lib/ocf/resource.d/openstack
+ # wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/cinder-api
+ # chmod a+rx *
+
+Now add the Pacemaker configuration for the Block Storage API resource.
+Connect to the Pacemaker cluster with the :command:`crm configure` command
+and add the following cluster resources:
+
+.. code-block:: none
+
+ primitive p_cinder-api ocf:openstack:cinder-api \
+ params config="/etc/cinder/cinder.conf" \
+ os_password="secretsecret" \
+ os_username="admin" \
+ os_tenant_name="admin" \
+ keystone_get_token_url="http://10.0.0.11:5000/v2.0/tokens" \
+ op monitor interval="30s" timeout="30s"
+
+This configuration creates ``p_cinder-api``, a resource for managing the
+Block Storage API service.
+
+The :command:`crm configure` command supports batch input. Copy and paste the
+lines above into your live Pacemaker configuration and then make changes as
+required. For example, you may enter ``edit p_ip_cinder-api`` from the
+:command:`crm configure` menu and edit the resource to match your preferred
+virtual IP address.
+
+Once completed, commit your configuration changes by entering :command:`commit`
+from the :command:`crm configure` menu. Pacemaker then starts the Block Storage
+API service and its dependent resources on one of your nodes.
+
+.. _ha-blockstorage-configure:
+
+Configure Block Storage API service
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Edit the ``/etc/cinder/cinder.conf`` file. For example, on a RHEL-based system:
+
+.. code-block:: ini
+ :linenos:
+
+ [DEFAULT]
+ # This is the name which we should advertise ourselves as and for
+ # A/P installations it should be the same everywhere
+ host = cinder-cluster-1
+
+ # Listen on the Block Storage VIP
+ osapi_volume_listen = 10.0.0.11
+
+ auth_strategy = keystone
+ control_exchange = cinder
+
+ volume_driver = cinder.volume.drivers.nfs.NfsDriver
+ nfs_shares_config = /etc/cinder/nfs_exports
+ nfs_sparsed_volumes = true
+ nfs_mount_options = v3
+
+ [database]
+ sql_connection = mysql://cinder:CINDER_DBPASS@10.0.0.11/cinder
+ max_retries = -1
+
+ [keystone_authtoken]
+ # 10.0.0.11 is the Keystone VIP
+ identity_uri = http://10.0.0.11:35357/
+ auth_uri = http://10.0.0.11:5000/
+ admin_tenant_name = service
+ admin_user = cinder
+ admin_password = CINDER_PASS
+
+ [oslo_messaging_rabbit]
+ # Explicitly list the rabbit hosts as it doesn't play well with HAProxy
+ rabbit_hosts = 10.0.0.12,10.0.0.13,10.0.0.14
+ # As a consequence, we also need HA queues
+ rabbit_ha_queues = True
+ heartbeat_timeout_threshold = 60
+ heartbeat_rate = 2
+
+Replace ``CINDER_DBPASS`` with the password you chose for the Block Storage
+database. Replace ``CINDER_PASS`` with the password you chose for the
+``cinder`` user in the Identity service.
+
+This example assumes that you are using NFS for the physical storage, which
+will almost never be true in a production installation.
+
+If you are using the Block Storage service OCF agent, some settings will
+be filled in for you, resulting in a shorter configuration file:
+
+.. code-block:: ini
+ :linenos:
+
+ # We have to use MySQL connection to store data:
+ sql_connection = mysql://cinder:CINDER_DBPASS@10.0.0.11/cinder
+ # Alternatively, you can switch to pymysql,
+ # a new Python 3 compatible library and use
+ # sql_connection = mysql+pymysql://cinder:CINDER_DBPASS@10.0.0.11/cinder
+ # and be ready when everything moves to Python 3.
+ # Ref: https://wiki.openstack.org/wiki/PyMySQL_evaluation
+
+ # We bind Block Storage API to the VIP:
+ osapi_volume_listen = 10.0.0.11
+
+ # We send notifications to High Available RabbitMQ:
+ notifier_strategy = rabbit
+ rabbit_host = 10.0.0.11
+
+Replace ``CINDER_DBPASS`` with the password you chose for the Block Storage
+database.
+
+.. _ha-blockstorage-services:
+
+Configure OpenStack services to use the highly available Block Storage API
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Your OpenStack services must now point their Block Storage API configuration
+to the highly available, virtual cluster IP address rather than a Block Storage
+API server’s physical IP address as you would for a non-HA environment.
+
+Create the Block Storage API endpoint with this IP.
+
+If you are using both private and public IP addresses, create two virtual IPs
+and define your endpoint. For example:
+
+.. code-block:: console
+
+ $ openstack endpoint create --region $KEYSTONE_REGION \
+ volume public 'http://PUBLIC_VIP:8776/v1/%(tenant_id)s'
+
+ $ openstack endpoint create --region $KEYSTONE_REGION \
+ volume admin 'http://10.0.0.11:8776/v1/%(tenant_id)s'
+
+ $ openstack endpoint create --region $KEYSTONE_REGION \
+ volume internal 'http://10.0.0.11:8776/v1/%(tenant_id)s'
+
diff --git a/doc/ha-guide-draft/source/storage-ha-file-systems.rst b/doc/ha-guide-draft/source/storage-ha-file-systems.rst
new file mode 100644
index 0000000000..5ef3e2e8a5
--- /dev/null
+++ b/doc/ha-guide-draft/source/storage-ha-file-systems.rst
@@ -0,0 +1,114 @@
+========================================
+Highly available Shared File Systems API
+========================================
+
+Making the Shared File Systems (manila) API service highly available
+in active/passive mode involves:
+
+- :ref:`ha-sharedfilesystems-configure`
+- :ref:`ha-sharedfilesystems-services`
+- :ref:`ha-sharedfilesystems-pacemaker`
+
+.. _ha-sharedfilesystems-configure:
+
+Configure Shared File Systems API service
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Edit the :file:`/etc/manila/manila.conf` file:
+
+.. code-block:: ini
+ :linenos:
+
+ # We have to use MySQL connection to store data:
+ sql_connection = mysql+pymysql://manila:password@10.0.0.11/manila?charset=utf8
+
+ # We bind Shared File Systems API to the VIP:
+ osapi_share_listen = 10.0.0.11
+
+ # We send notifications to High Available RabbitMQ:
+ notifier_strategy = rabbit
+ rabbit_host = 10.0.0.11
+
+
+.. _ha-sharedfilesystems-services:
+
+Configure OpenStack services to use Shared File Systems API
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Your OpenStack services must now point their Shared File Systems API
+configuration to the highly available, virtual cluster IP address rather than
+a Shared File Systems API server’s physical IP address as you would
+for a non-HA environment.
+
+You must create the Shared File Systems API endpoint with this IP.
+
+If you are using both private and public IP addresses, you should create two
+virtual IPs and define your endpoints like this:
+
+.. code-block:: console
+
+ $ openstack endpoint create --region RegionOne \
+ sharev2 public 'http://PUBLIC_VIP:8786/v2/%(tenant_id)s'
+
+ $ openstack endpoint create --region RegionOne \
+ sharev2 internal 'http://10.0.0.11:8786/v2/%(tenant_id)s'
+
+ $ openstack endpoint create --region RegionOne \
+ sharev2 admin 'http://10.0.0.11:8786/v2/%(tenant_id)s'
+
+.. _ha-sharedfilesystems-pacemaker:
+
+Add Shared File Systems API resource to Pacemaker
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+#. Download the resource agent to your system:
+
+ .. code-block:: console
+
+ # cd /usr/lib/ocf/resource.d/openstack
+ # wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/manila-api
+ # chmod a+rx *
+
+#. Add the Pacemaker configuration for the Shared File Systems
+ API resource. Connect to the Pacemaker cluster with the following
+ command:
+
+ .. code-block:: console
+
+ # crm configure
+
+ .. note::
+
+ The :command:`crm configure` command supports batch input. Copy and paste
+ the lines in the next step into your live Pacemaker configuration and then
+ make changes as required.
+
+ For example, you may enter ``edit p_ip_manila-api`` from the
+ :command:`crm configure` menu and edit the resource to match your preferred
+ virtual IP address.
+
+#. Add the following cluster resources:
+
+ .. code-block:: none
+
+ primitive p_manila-api ocf:openstack:manila-api \
+ params config="/etc/manila/manila.conf" \
+ os_password="secretsecret" \
+ os_username="admin" \
+ os_tenant_name="admin" \
+ keystone_get_token_url="http://10.0.0.11:5000/v2.0/tokens" \
+ op monitor interval="30s" timeout="30s"
+
+ This configuration creates ``p_manila-api``, a resource for managing the
+ Shared File Systems API service.
+
+#. Commit your configuration changes by entering the following command
+ from the :command:`crm configure` menu:
+
+ .. code-block:: console
+
+ # commit
+
+Pacemaker now starts the Shared File Systems API service and its
+dependent resources on one of your nodes.
+
diff --git a/doc/ha-guide-draft/source/storage-ha-image.rst b/doc/ha-guide-draft/source/storage-ha-image.rst
new file mode 100644
index 0000000000..362c65c5ac
--- /dev/null
+++ b/doc/ha-guide-draft/source/storage-ha-image.rst
@@ -0,0 +1,141 @@
+==========================
+Highly available Image API
+==========================
+
+The OpenStack Image service offers a service for discovering, registering, and
+retrieving virtual machine images. To make the OpenStack Image API service
+highly available in active/passive mode, you must:
+
+- :ref:`glance-api-pacemaker`
+- :ref:`glance-api-configure`
+- :ref:`glance-services`
+
+Prerequisites
+~~~~~~~~~~~~~
+
+Before beginning, ensure that you are familiar with the
+documentation for installing the OpenStack Image API service.
+See the *Image service* section in the
+`Installation Tutorials and Guides `_,
+depending on your distribution.
+
+.. _glance-api-pacemaker:
+
+Add OpenStack Image API resource to Pacemaker
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+#. Download the resource agent to your system:
+
+ .. code-block:: console
+
+ # cd /usr/lib/ocf/resource.d/openstack
+ # wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/glance-api
+ # chmod a+rx *
+
+#. Add the Pacemaker configuration for the OpenStack Image API resource.
+ Use the following command to connect to the Pacemaker cluster:
+
+ .. code-block:: console
+
+ # crm configure
+
+ .. note::
+
+ The :command:`crm configure` command supports batch input. Copy and paste
+ the lines in the next step into your live Pacemaker configuration and
+ then make changes as required.
+
+ For example, you may enter ``edit p_ip_glance-api`` from the
+ :command:`crm configure` menu and edit the resource to match your
+ preferred virtual IP address.
+
+#. Add the following cluster resources:
+
+ .. code-block:: none
+
+ primitive p_glance-api ocf:openstack:glance-api \
+ params config="/etc/glance/glance-api.conf" \
+ os_password="secretsecret" \
+ os_username="admin" os_tenant_name="admin" \
+ os_auth_url="http://10.0.0.11:5000/v2.0/" \
+ op monitor interval="30s" timeout="30s"
+
+ This configuration creates ``p_glance-api``, a resource for managing the
+ OpenStack Image API service.
+
+#. Commit your configuration changes by entering the following command from
+ the :command:`crm configure` menu:
+
+ .. code-block:: console
+
+ # commit
+
+Pacemaker then starts the OpenStack Image API service and its dependent
+resources on one of your nodes.
+
+.. _glance-api-configure:
+
+Configure OpenStack Image service API
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Edit the :file:`/etc/glance/glance-api.conf` file
+to configure the OpenStack Image service:
+
+.. code-block:: ini
+
+ # We have to use MySQL connection to store data:
+ sql_connection=mysql://glance:password@10.0.0.11/glance
+ # Alternatively, you can switch to pymysql,
+ # a new Python 3 compatible library and use
+ # sql_connection=mysql+pymysql://glance:password@10.0.0.11/glance
+ # and be ready when everything moves to Python 3.
+ # Ref: https://wiki.openstack.org/wiki/PyMySQL_evaluation
+
+ # We bind OpenStack Image API to the VIP:
+ bind_host = 10.0.0.11
+
+ # Connect to OpenStack Image registry service:
+ registry_host = 10.0.0.11
+
+ # We send notifications to High Available RabbitMQ:
+ notifier_strategy = rabbit
+ rabbit_host = 10.0.0.11
+
+.. TODO: need more discussion of these parameters
+
+.. _glance-services:
+
+Configure OpenStack services to use the highly available OpenStack Image API
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Your OpenStack services must now point their OpenStack Image API configuration
+to the highly available, virtual cluster IP address instead of pointing to the
+physical IP address of an OpenStack Image API server as you would in a non-HA
+cluster.
+
+For example, if your OpenStack Image API service IP address is 10.0.0.11
+(as in the configuration explained here), you would use the following
+configuration in your :file:`nova.conf` file:
+
+.. code-block:: ini
+
+ [glance]
+ # ...
+ api_servers = 10.0.0.11
+ # ...
+
+
+You must also create the OpenStack Image API endpoint with this IP address.
+If you are using both private and public IP addresses, create two virtual IP
+addresses and define your endpoint. For example:
+
+.. code-block:: console
+
+ $ openstack endpoint create --region $KEYSTONE_REGION \
+ image public http://PUBLIC_VIP:9292
+
+ $ openstack endpoint create --region $KEYSTONE_REGION \
+ image admin http://10.0.0.11:9292
+
+ $ openstack endpoint create --region $KEYSTONE_REGION \
+ image internal http://10.0.0.11:9292
diff --git a/doc/ha-guide-draft/source/storage-ha.rst b/doc/ha-guide-draft/source/storage-ha.rst
new file mode 100644
index 0000000000..22ea30c492
--- /dev/null
+++ b/doc/ha-guide-draft/source/storage-ha.rst
@@ -0,0 +1,22 @@
+===================
+Configuring storage
+===================
+
+.. toctree::
+ :maxdepth: 2
+
+ storage-ha-image.rst
+ storage-ha-block.rst
+ storage-ha-file-systems.rst
+ storage-ha-backend.rst
+
+Making the Block Storage (cinder) API service highly available in
+active/passive mode involves:
+
+* Configuring Block Storage to listen on the VIP address
+
+* Managing the Block Storage API daemon with the Pacemaker cluster manager
+
+* Configuring OpenStack services to use this IP address
+
+.. To Do: HA without Pacemaker
diff --git a/doc/ha-guide-draft/source/testing.rst b/doc/ha-guide-draft/source/testing.rst
new file mode 100644
index 0000000000..3cb8110302
--- /dev/null
+++ b/doc/ha-guide-draft/source/testing.rst
@@ -0,0 +1,6 @@
+=======
+Testing
+=======
+
+
+
diff --git a/tools/build-all-rst.sh b/tools/build-all-rst.sh
index beb747ac6e..e4571abdaf 100755
--- a/tools/build-all-rst.sh
+++ b/tools/build-all-rst.sh
@@ -33,7 +33,7 @@ PDF_TARGETS=( 'arch-design'\
# Note that these guides are only build for master branch
for guide in admin-guide arch-design cli-reference contributor-guide \
- ha-guide image-guide ops-guide user-guide; do
+ ha-guide ha-guide-draft image-guide ops-guide user-guide; do
if [[ ${PDF_TARGETS[*]} =~ $guide ]]; then
tools/build-rst.sh doc/$guide --build build \
--target $guide $LINKCHECK $PDF_OPTION
@@ -46,7 +46,7 @@ done
# Draft guides
# This includes guides that we publish from stable branches
# as versioned like the networking-guide.
-for guide in networking-guide config-reference; do
+for guide in ha-guide-draft networking-guide config-reference; do
if [[ ${PDF_TARGETS[*]} =~ $guide ]]; then
tools/build-rst.sh doc/$guide --build build \
--target "draft/$guide" $LINKCHECK $PDF_OPTION
diff --git a/www/draft/draft-index.html b/www/draft/draft-index.html
index 30071f41f3..cc6d5ac233 100644
--- a/www/draft/draft-index.html
+++ b/www/draft/draft-index.html
@@ -77,6 +77,7 @@