diff --git a/doc-tools-check-languages.conf b/doc-tools-check-languages.conf index 9a25534d4e..4d08f4ba13 100644 --- a/doc-tools-check-languages.conf +++ b/doc-tools-check-languages.conf @@ -55,6 +55,7 @@ declare -A SPECIAL_BOOKS=( ["config-reference"]="skip" ["contributor-guide"]="skip" ["releasenotes"]="skip" + ["ha-guide-draft"]="skip" # Skip old arch design, will be archived ["arch-design-to-archive"]="skip" ) diff --git a/doc/ha-guide-draft/setup.cfg b/doc/ha-guide-draft/setup.cfg new file mode 100644 index 0000000000..44d048a0e1 --- /dev/null +++ b/doc/ha-guide-draft/setup.cfg @@ -0,0 +1,27 @@ +[metadata] +name = openstackhaguide +summary = OpenStack High Availability Guide +author = OpenStack +author-email = openstack-docs@lists.openstack.org +home-page = https://docs.openstack.org/ +classifier = +Environment :: OpenStack +Intended Audience :: Information Technology +Intended Audience :: System Administrators +License :: OSI Approved :: Apache Software License +Operating System :: POSIX :: Linux +Topic :: Documentation + +[global] +setup-hooks = + pbr.hooks.setup_hook + +[files] + +[build_sphinx] +warning-is-error = 1 +build-dir = build +source-dir = source + +[wheel] +universal = 1 diff --git a/doc/ha-guide-draft/setup.py b/doc/ha-guide-draft/setup.py new file mode 100644 index 0000000000..736375744d --- /dev/null +++ b/doc/ha-guide-draft/setup.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# Copyright (c) 2013 Hewlett-Packard Development Company, L.P. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# THIS FILE IS MANAGED BY THE GLOBAL REQUIREMENTS REPO - DO NOT EDIT +import setuptools + +# In python < 2.7.4, a lazy loading of package `pbr` will break +# setuptools if some other modules registered functions in `atexit`. +# solution from: http://bugs.python.org/issue15881#msg170215 +try: + import multiprocessing # noqa +except ImportError: + pass + +setuptools.setup( + setup_requires=['pbr'], + pbr=True) diff --git a/doc/ha-guide-draft/source/common b/doc/ha-guide-draft/source/common new file mode 120000 index 0000000000..dc879abe93 --- /dev/null +++ b/doc/ha-guide-draft/source/common @@ -0,0 +1 @@ +../../common \ No newline at end of file diff --git a/doc/ha-guide-draft/source/compute-node-ha.rst b/doc/ha-guide-draft/source/compute-node-ha.rst new file mode 100644 index 0000000000..b1fb659269 --- /dev/null +++ b/doc/ha-guide-draft/source/compute-node-ha.rst @@ -0,0 +1,55 @@ +============================ +Configuring the compute node +============================ + +The `Installation Tutorials and Guides +`_ +provide instructions for installing multiple compute nodes. +To make the compute nodes highly available, you must configure the +environment to include multiple instances of the API and other services. + +Configuring high availability for instances +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As of September 2016, the OpenStack High Availability community is +designing and developing an official and unified way to provide high +availability for instances. 
We are developing automatic +recovery from failures of hardware or hypervisor-related software on +the compute node, or other failures that could prevent instances from +functioning correctly, such as, issues with a cinder volume I/O path. + +More details are available in the `user story +`_ +co-authored by OpenStack's HA community and `Product Working Group +`_ (PWG), where this feature is +identified as missing functionality in OpenStack, which +should be addressed with high priority. + +Existing solutions +~~~~~~~~~~~~~~~~~~ + +The architectural challenges of instance HA and several currently +existing solutions were presented in `a talk at the Austin summit +`_, +for which `slides are also available `_. + +The code for three of these solutions can be found online at the following +links: + +* `a mistral-based auto-recovery workflow + `_, by Intel +* `masakari `_, by NTT +* `OCF RAs + `_, + as used by Red Hat and SUSE + +Current upstream work +~~~~~~~~~~~~~~~~~~~~~ + +Work is in progress on a unified approach, which combines the best +aspects of existing upstream solutions. More details are available on +`the HA VMs user story wiki +`_. + +To get involved with this work, see the section on the +:doc:`ha-community`. diff --git a/doc/ha-guide-draft/source/conf.py b/doc/ha-guide-draft/source/conf.py new file mode 100644 index 0000000000..6500ece076 --- /dev/null +++ b/doc/ha-guide-draft/source/conf.py @@ -0,0 +1,301 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import os +# import sys + +import openstackdocstheme + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +# templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = u'High Availability Guide Draft' +bug_tag = u'ha-guide-draft' +copyright = u'2017, OpenStack contributors' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.0.1' +# The full version, including alpha/beta/rc tags. +release = '0.0.1' + +# A few variables have to be set for the log-a-bug feature. +# giturl: The location of conf.py on Git. Must be set manually. +# gitsha: The SHA checksum of the bug description. Automatically extracted from git log. +# bug_tag: Tag for categorizing the bug. Must be set manually. +# These variables are passed to the logabug code via html_context. +giturl = u'https://git.openstack.org/cgit/openstack/openstack-manuals/tree/doc/ha-guide-draft/source' +git_cmd = "/usr/bin/git log | head -n1 | cut -f2 -d' '" +gitsha = os.popen(git_cmd).read().strip('\n') +html_context = {"gitsha": gitsha, "bug_tag": bug_tag, + "giturl": giturl} + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['common/cli*', 'common/nova*', + 'common/get-started*', 'common/dashboard*'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'openstackdocs' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = [openstackdocstheme.get_html_theme_path()] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. 
+# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = [] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# So that we can enable "log-a-bug" links from each output HTML page, this +# variable must be set to a format that includes year, month, day, hours and +# minutes. +html_last_updated_fmt = '%Y-%m-%d %H:%M' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +html_use_index = False + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +html_show_sourcelink = False + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'ha-guide-draft' + +# If true, publish source files +html_copy_source = False + +# -- Options for LaTeX output --------------------------------------------- + +latex_engine = 'xelatex' + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + + # set font (TODO: different fonts for translated PDF document builds) + 'fontenc': '\\usepackage{fontspec}', + 'fontpkg': '''\ +\defaultfontfeatures{Scale=MatchLowercase} +\setmainfont{Liberation Serif} +\setsansfont{Liberation Sans} +\setmonofont[SmallCapsFont={Liberation Mono}]{Liberation Mono} +''', + + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'HAGuideDraft.tex', u'High Availability Guide Draft', + u'OpenStack contributors', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. 
+# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'haguidedraft', u'High Availability GuideDraft', + [u'OpenStack contributors'], 1) +] + +# If true, show URL addresses after external links. +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'HAGuideDraft', u'High Availability Guide Draft', + u'OpenStack contributors', 'HAGuideDraft', + 'This guide shows OpenStack operators and deployers how to configure' + 'OpenStack Networking to be robust and fault-tolerant.', 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +# texinfo_appendices = [] + +# If false, no module index is generated. +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +# texinfo_no_detailmenu = False + +# -- Options for Internationalization output ------------------------------ +locale_dirs = ['locale/'] diff --git a/doc/ha-guide-draft/source/control-plane-stateful.rst b/doc/ha-guide-draft/source/control-plane-stateful.rst new file mode 100644 index 0000000000..26bfdcea40 --- /dev/null +++ b/doc/ha-guide-draft/source/control-plane-stateful.rst @@ -0,0 +1,342 @@ +================================= +Configuring the stateful services +================================= +.. to do: scope how in depth we want these sections to be + +Database for high availability +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Galera +------ + +The first step is to install the database that sits at the heart of the +cluster. To implement high availability, run an instance of the database on +each controller node and use Galera Cluster to provide replication between +them. Galera Cluster is a synchronous multi-master database cluster, based +on MySQL and the InnoDB storage engine. It is a high-availability service +that provides high system uptime, no data loss, and scalability for growth. + +You can achieve high availability for the OpenStack database in many +different ways, depending on the type of database that you want to use. +There are three implementations of Galera Cluster available to you: + +- `Galera Cluster for MySQL `_: The MySQL + reference implementation from Codership, Oy. +- `MariaDB Galera Cluster `_: The MariaDB + implementation of Galera Cluster, which is commonly supported in + environments based on Red Hat distributions. +- `Percona XtraDB Cluster `_: The XtraDB + implementation of Galera Cluster from Percona. + +In addition to Galera Cluster, you can also achieve high availability +through other database options, such as PostgreSQL, which has its own +replication system. 
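Whichever Galera implementation you choose, the cluster-related settings live in the
MySQL/MariaDB configuration on every controller node. The fragment below is a sketch
only, assuming three hypothetical controllers at 10.0.0.12, 10.0.0.13, and 10.0.0.14;
the provider library path, and in some cases the option names, vary with your
distribution and Galera version:

.. code-block:: ini

   [mysqld]
   # Galera requires row-based replication and InnoDB
   binlog_format = ROW
   default_storage_engine = InnoDB
   innodb_autoinc_lock_mode = 2

   # Galera provider and cluster membership (example path and addresses)
   wsrep_provider = /usr/lib64/galera/libgalera_smm.so
   wsrep_cluster_name = "openstack_db_cluster"
   wsrep_cluster_address = "gcomm://10.0.0.12,10.0.0.13,10.0.0.14"

   # Settings specific to this node (controller1 in this example)
   wsrep_node_name = "controller1"
   wsrep_node_address = "10.0.0.12"

The cluster-wide options are identical on every controller; only the node name and
address change per node. The first node is normally bootstrapped on its own, after
which the remaining nodes join using the ``gcomm://`` address list.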
+ +Pacemaker active/passive with HAproxy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Replicated storage +------------------ + +For example: DRBD + +Shared storage +-------------- + +Messaging service for high availability +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +RabbitMQ +-------- + +An AMQP (Advanced Message Queuing Protocol) compliant message bus is +required for most OpenStack components in order to coordinate the +execution of jobs entered into the system. + +The most popular AMQP implementation used in OpenStack installations +is RabbitMQ. + +RabbitMQ nodes fail over on the application and the infrastructure layers. + +The application layer is controlled by the ``oslo.messaging`` +configuration options for multiple AMQP hosts. If the AMQP node fails, +the application reconnects to the next one configured within the +specified reconnect interval. The specified reconnect interval +constitutes its SLA. + +On the infrastructure layer, the SLA is the time for which RabbitMQ +cluster reassembles. Several cases are possible. The Mnesia keeper +node is the master of the corresponding Pacemaker resource for +RabbitMQ. When it fails, the result is a full AMQP cluster downtime +interval. Normally, its SLA is no more than several minutes. Failure +of another node that is a slave of the corresponding Pacemaker +resource for RabbitMQ results in no AMQP cluster downtime at all. + +.. until we've determined the content depth, I've transferred RabbitMQ + configuration below from the old HA guide (darrenc) + +Making the RabbitMQ service highly available involves the following steps: + +- :ref:`Install RabbitMQ` + +- :ref:`Configure RabbitMQ for HA queues` + +- :ref:`Configure OpenStack services to use RabbitMQ HA queues + ` + +.. note:: + + Access to RabbitMQ is not normally handled by HAProxy. Instead, + consumers must be supplied with the full list of hosts running + RabbitMQ with ``rabbit_hosts`` and turn on the ``rabbit_ha_queues`` + option. For more information, read the `core issue + `_. + For more detail, read the `history and solution + `_. + +.. _rabbitmq-install: + +Install RabbitMQ +^^^^^^^^^^^^^^^^ + +The commands for installing RabbitMQ are specific to the Linux distribution +you are using. + +For Ubuntu or Debian: + +.. code-block: console + + # apt-get install rabbitmq-server + +For RHEL, Fedora, or CentOS: + +.. code-block: console + + # yum install rabbitmq-server + +For openSUSE: + +.. code-block: console + + # zypper install rabbitmq-server + +For SLES 12: + +.. code-block: console + + # zypper addrepo -f obs://Cloud:OpenStack:Kilo/SLE_12 Kilo + [Verify the fingerprint of the imported GPG key. See below.] + # zypper install rabbitmq-server + +.. note:: + + For SLES 12, the packages are signed by GPG key 893A90DAD85F9316. + You should verify the fingerprint of the imported GPG key before using it. + + .. code-block:: none + + Key ID: 893A90DAD85F9316 + Key Name: Cloud:OpenStack OBS Project + Key Fingerprint: 35B34E18ABC1076D66D5A86B893A90DAD85F9316 + Key Created: Tue Oct 8 13:34:21 2013 + Key Expires: Thu Dec 17 13:34:21 2015 + +For more information, see the official installation manual for the +distribution: + +- `Debian and Ubuntu `_ +- `RPM based `_ + (RHEL, Fedora, CentOS, openSUSE) + +.. _rabbitmq-configure: + +Configure RabbitMQ for HA queues +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. [TODO: This section should begin with a brief mention +.. about what HA queues are and why they are valuable, etc] + +.. [TODO: replace "currently" with specific release names] + +.. 
[TODO: Does this list need to be updated? Perhaps we need a table +.. that shows each component and the earliest release that allows it +.. to work with HA queues.] + +The following components/services can work with HA queues: + +- OpenStack Compute +- OpenStack Block Storage +- OpenStack Networking +- Telemetry + +Consider that, while exchanges and bindings survive the loss of individual +nodes, queues and their messages do not because a queue and its contents +are located on one node. If we lose this node, we also lose the queue. + +Mirrored queues in RabbitMQ improve the availability of service since +it is resilient to failures. + +Production servers should run (at least) three RabbitMQ servers for testing +and demonstration purposes, however it is possible to run only two servers. +In this section, we configure two nodes, called ``rabbit1`` and ``rabbit2``. +To build a broker, ensure that all nodes have the same Erlang cookie file. + +.. [TODO: Should the example instead use a minimum of three nodes?] + +#. Stop RabbitMQ and copy the cookie from the first node to each of the + other node(s): + + .. code-block:: console + + # scp /var/lib/rabbitmq/.erlang.cookie root@NODE:/var/lib/rabbitmq/.erlang.cookie + +#. On each target node, verify the correct owner, + group, and permissions of the file :file:`erlang.cookie`: + + .. code-block:: console + + # chown rabbitmq:rabbitmq /var/lib/rabbitmq/.erlang.cookie + # chmod 400 /var/lib/rabbitmq/.erlang.cookie + +#. Start the message queue service on all nodes and configure it to start + when the system boots. On Ubuntu, it is configured by default. + + On CentOS, RHEL, openSUSE, and SLES: + + .. code-block:: console + + # systemctl enable rabbitmq-server.service + # systemctl start rabbitmq-server.service + +#. Verify that the nodes are running: + + .. code-block:: console + + # rabbitmqctl cluster_status + Cluster status of node rabbit@NODE... + [{nodes,[{disc,[rabbit@NODE]}]}, + {running_nodes,[rabbit@NODE]}, + {partitions,[]}] + ...done. + +#. Run the following commands on each node except the first one: + + .. code-block:: console + + # rabbitmqctl stop_app + Stopping node rabbit@NODE... + ...done. + # rabbitmqctl join_cluster --ram rabbit@rabbit1 + # rabbitmqctl start_app + Starting node rabbit@NODE ... + ...done. + +.. note:: + + The default node type is a disc node. In this guide, nodes + join the cluster as RAM nodes. + +#. Verify the cluster status: + + .. code-block:: console + + # rabbitmqctl cluster_status + Cluster status of node rabbit@NODE... + [{nodes,[{disc,[rabbit@rabbit1]},{ram,[rabbit@NODE]}]}, \ + {running_nodes,[rabbit@NODE,rabbit@rabbit1]}] + + If the cluster is working, you can create usernames and passwords + for the queues. + +#. To ensure that all queues except those with auto-generated names + are mirrored across all running nodes, + set the ``ha-mode`` policy key to all + by running the following command on one of the nodes: + + .. code-block:: console + + # rabbitmqctl set_policy ha-all '^(?!amq\.).*' '{"ha-mode": "all"}' + +More information is available in the RabbitMQ documentation: + +- `Highly Available Queues `_ +- `Clustering Guide `_ + +.. note:: + + As another option to make RabbitMQ highly available, RabbitMQ contains the + OCF scripts for the Pacemaker cluster resource agents since version 3.5.7. + It provides the active/active RabbitMQ cluster with mirrored queues. + For more information, see `Auto-configuration of a cluster with + a Pacemaker `_. + +.. 
_rabbitmq-services: + +Configure OpenStack services to use Rabbit HA queues +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Configure the OpenStack components to use at least two RabbitMQ nodes. + +Use these steps to configurate all services using RabbitMQ: + +#. RabbitMQ HA cluster ``host:port`` pairs: + + .. code-block:: console + + rabbit_hosts=rabbit1:5672,rabbit2:5672,rabbit3:5672 + +#. Retry connecting with RabbitMQ: + + .. code-block:: console + + rabbit_retry_interval=1 + +#. How long to back-off for between retries when connecting to RabbitMQ: + + .. code-block:: console + + rabbit_retry_backoff=2 + +#. Maximum retries with trying to connect to RabbitMQ (infinite by default): + + .. code-block:: console + + rabbit_max_retries=0 + +#. Use durable queues in RabbitMQ: + + .. code-block:: console + + rabbit_durable_queues=true + +#. Use HA queues in RabbitMQ (``x-ha-policy: all``): + + .. code-block:: console + + rabbit_ha_queues=true + +.. note:: + + If you change the configuration from an old set-up + that did not use HA queues, restart the service: + + .. code-block:: console + + # rabbitmqctl stop_app + # rabbitmqctl reset + # rabbitmqctl start_app + + + + + +Pacemaker active/passive +------------------------ + + + +Mirrored queues +--------------- + +Qpid +---- diff --git a/doc/ha-guide-draft/source/control-plane-stateless.rst b/doc/ha-guide-draft/source/control-plane-stateless.rst new file mode 100644 index 0000000000..2daa2a5be2 --- /dev/null +++ b/doc/ha-guide-draft/source/control-plane-stateless.rst @@ -0,0 +1,518 @@ +============================== +Configuring stateless services +============================== + +.. to do: scope what details we want on the following services + +API services +~~~~~~~~~~~~ + +Load-balancer +~~~~~~~~~~~~~ + +HAProxy +------- + +HAProxy provides a fast and reliable HTTP reverse proxy and load balancer +for TCP or HTTP applications. It is particularly suited for web crawling +under very high loads while needing persistence or Layer 7 processing. +It realistically supports tens of thousands of connections with recent +hardware. + +Each instance of HAProxy configures its front end to accept connections only +to the virtual IP (VIP) address. The HAProxy back end (termination +point) is a list of all the IP addresses of instances for load balancing. + +.. note:: + + Ensure your HAProxy installation is not a single point of failure, + it is advisable to have multiple HAProxy instances running. + + You can also ensure the availability by other means, using Keepalived + or Pacemaker. + +Alternatively, you can use a commercial load balancer, which is hardware +or software. We recommend a hardware load balancer as it generally has +good performance. + +For detailed instructions about installing HAProxy on your nodes, +see the HAProxy `official documentation `_. + +Configuring HAProxy +^^^^^^^^^^^^^^^^^^^ + +#. Restart the HAProxy service. + +#. Locate your HAProxy instance on each OpenStack controller in your + environment. The following is an example ``/etc/haproxy/haproxy.cfg`` + configuration file. Configure your instance using the following + configuration file, you will need a copy of it on each + controller node. + + + .. 
code-block:: none + + global + chroot /var/lib/haproxy + daemon + group haproxy + maxconn 4000 + pidfile /var/run/haproxy.pid + user haproxy + + defaults + log global + maxconn 4000 + option redispatch + retries 3 + timeout http-request 10s + timeout queue 1m + timeout connect 10s + timeout client 1m + timeout server 1m + timeout check 10s + + listen dashboard_cluster + bind :443 + balance source + option tcpka + option httpchk + option tcplog + server controller1 10.0.0.12:443 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:443 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:443 check inter 2000 rise 2 fall 5 + + listen galera_cluster + bind :3306 + balance source + option mysql-check + server controller1 10.0.0.12:3306 check port 9200 inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:3306 backup check port 9200 inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:3306 backup check port 9200 inter 2000 rise 2 fall 5 + + listen glance_api_cluster + bind :9292 + balance source + option tcpka + option httpchk + option tcplog + server controller1 10.0.0.12:9292 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:9292 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:9292 check inter 2000 rise 2 fall 5 + + listen glance_registry_cluster + bind :9191 + balance source + option tcpka + option tcplog + server controller1 10.0.0.12:9191 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:9191 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:9191 check inter 2000 rise 2 fall 5 + + listen keystone_admin_cluster + bind :35357 + balance source + option tcpka + option httpchk + option tcplog + server controller1 10.0.0.12:35357 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:35357 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:35357 check inter 2000 rise 2 fall 5 + + listen keystone_public_internal_cluster + bind :5000 + balance source + option tcpka + option httpchk + option tcplog + server controller1 10.0.0.12:5000 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:5000 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:5000 check inter 2000 rise 2 fall 5 + + listen nova_ec2_api_cluster + bind :8773 + balance source + option tcpka + option tcplog + server controller1 10.0.0.12:8773 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:8773 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:8773 check inter 2000 rise 2 fall 5 + + listen nova_compute_api_cluster + bind :8774 + balance source + option tcpka + option httpchk + option tcplog + server controller1 10.0.0.12:8774 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:8774 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:8774 check inter 2000 rise 2 fall 5 + + listen nova_metadata_api_cluster + bind :8775 + balance source + option tcpka + option tcplog + server controller1 10.0.0.12:8775 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:8775 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:8775 check inter 2000 rise 2 fall 5 + + listen cinder_api_cluster + bind :8776 + balance source + option tcpka + option httpchk + option tcplog + server controller1 10.0.0.12:8776 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:8776 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:8776 check inter 2000 rise 2 fall 5 + + listen ceilometer_api_cluster + bind :8777 + balance source + option tcpka + option tcplog + server controller1 
10.0.0.12:8777 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:8777 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:8777 check inter 2000 rise 2 fall 5 + + listen nova_vncproxy_cluster + bind :6080 + balance source + option tcpka + option tcplog + server controller1 10.0.0.12:6080 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:6080 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:6080 check inter 2000 rise 2 fall 5 + + listen neutron_api_cluster + bind :9696 + balance source + option tcpka + option httpchk + option tcplog + server controller1 10.0.0.12:9696 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:9696 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:9696 check inter 2000 rise 2 fall 5 + + listen swift_proxy_cluster + bind :8080 + balance source + option tcplog + option tcpka + server controller1 10.0.0.12:8080 check inter 2000 rise 2 fall 5 + server controller2 10.0.0.13:8080 check inter 2000 rise 2 fall 5 + server controller3 10.0.0.14:8080 check inter 2000 rise 2 fall 5 + + .. note:: + + The Galera cluster configuration directive ``backup`` indicates + that two of the three controllers are standby nodes. + This ensures that only one node services write requests + because OpenStack support for multi-node writes is not yet production-ready. + + .. note:: + + The Telemetry API service configuration does not have the ``option httpchk`` + directive as it cannot process this check properly. + +.. TODO: explain why the Telemetry API is so special + +#. Configure the kernel parameter to allow non-local IP binding. This allows + running HAProxy instances to bind to a VIP for failover. Add following line + to ``/etc/sysctl.conf``: + + .. code-block:: none + + net.ipv4.ip_nonlocal_bind = 1 + +#. Restart the host or, to make changes work immediately, invoke: + + .. code-block:: console + + $ sysctl -p + +#. Add HAProxy to the cluster and ensure the VIPs can only run on machines + where HAProxy is active: + + ``pcs`` + + .. code-block:: console + + $ pcs resource create lb-haproxy systemd:haproxy --clone + $ pcs constraint order start vip then lb-haproxy-clone kind=Optional + $ pcs constraint colocation add lb-haproxy-clone with vip + + ``crmsh`` + + .. code-block:: console + + $ crm cib new conf-haproxy + $ crm configure primitive haproxy lsb:haproxy op monitor interval="1s" + $ crm configure clone haproxy-clone haproxy + $ crm configure colocation vip-with-haproxy inf: vip haproxy-clone + $ crm configure order haproxy-after-vip mandatory: vip haproxy-clone + + +Pacemaker versus systemd +------------------------ + +Memcached +--------- + +Memcached is a general-purpose distributed memory caching system. It +is used to speed up dynamic database-driven websites by caching data +and objects in RAM to reduce the number of times an external data +source must be read. + +Memcached is a memory cache demon that can be used by most OpenStack +services to store ephemeral data, such as tokens. + +Access to Memcached is not handled by HAProxy because replicated +access is currently in an experimental state. Instead, OpenStack +services must be supplied with the full list of hosts running +Memcached. + +The Memcached client implements hashing to balance objects among the +instances. Failure of an instance impacts only a percentage of the +objects and the client automatically removes it from the list of +instances. The SLA is several minutes. 
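As an illustration of supplying the full host list, services that use the
``oslo.cache`` library take a comma-separated set of Memcached endpoints in their
``[cache]`` section. The snippet below is a sketch only, assuming three controllers
reachable as ``controller1`` through ``controller3`` on the default Memcached port;
consult each service's configuration reference for the options supported by your
release:

.. code-block:: ini

   [cache]
   # Enable caching and use the pooled Memcached backend from oslo.cache
   enabled = true
   backend = oslo_cache.memcache_pool

   # List every Memcached instance so the client can hash keys across them
   memcache_servers = controller1:11211,controller2:11211,controller3:11211

If one of the listed hosts fails, only the keys hashed to that host are affected and
the client removes it from the ring until it becomes reachable again.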
+ + +Highly available API services +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Identity API +------------ + +Ensure you have read the +`OpenStack Identity service getting started documentation +`_. + +.. to do: reference controller-ha-identity and see if section involving + adding to pacemaker is in scope + + +Add OpenStack Identity resource to Pacemaker +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following section(s) detail how to add the Identity service +to Pacemaker on SUSE and Red Hat. + +SUSE +---- + +SUSE Enterprise Linux and SUSE-based distributions, such as openSUSE, +use a set of OCF agents for controlling OpenStack services. + +#. Run the following commands to download the OpenStack Identity resource + to Pacemaker: + + .. code-block:: console + + # cd /usr/lib/ocf/resource.d + # mkdir openstack + # cd openstack + # wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/keystone + # chmod a+rx * + +#. Add the Pacemaker configuration for the OpenStack Identity resource + by running the following command to connect to the Pacemaker cluster: + + .. code-block:: console + + # crm configure + +#. Add the following cluster resources: + + .. code-block:: console + + clone p_keystone ocf:openstack:keystone \ + params config="/etc/keystone/keystone.conf" os_password="secretsecret" os_username="admin" os_tenant_name="admin" os_auth_url="http://10.0.0.11:5000/v2.0/" \ + op monitor interval="30s" timeout="30s" + + .. note:: + + This configuration creates ``p_keystone``, + a resource for managing the OpenStack Identity service. + +#. Commit your configuration changes from the :command:`crm configure` menu + with the following command: + + .. code-block:: console + + # commit + + The :command:`crm configure` supports batch input. You may have to copy and + paste the above lines into your live Pacemaker configuration, and then make + changes as required. + + For example, you may enter ``edit p_ip_keystone`` from the + :command:`crm configure` menu and edit the resource to match your preferred + virtual IP address. + + Pacemaker now starts the OpenStack Identity service and its dependent + resources on all of your nodes. + +Red Hat +-------- + +For Red Hat Enterprise Linux and Red Hat-based Linux distributions, +the following process uses Systemd unit files. + +.. code-block:: console + + # pcs resource create openstack-keystone systemd:openstack-keystone --clone interleave=true + +.. _identity-config-identity: + +Configure OpenStack Identity service +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +#. Edit the :file:`keystone.conf` file + to change the values of the :manpage:`bind(2)` parameters: + + .. code-block:: ini + + bind_host = 10.0.0.12 + public_bind_host = 10.0.0.12 + admin_bind_host = 10.0.0.12 + + The ``admin_bind_host`` parameter + lets you use a private network for admin access. + +#. To be sure that all data is highly available, + ensure that everything is stored in the MySQL database + (which is also highly available): + + .. code-block:: ini + + [catalog] + driver = keystone.catalog.backends.sql.Catalog + # ... + [identity] + driver = keystone.identity.backends.sql.Identity + # ... + +#. If the Identity service will be sending ceilometer notifications + and your message bus is configured for high availability, you will + need to ensure that the Identity service is correctly configured to + use it. + +.. 
_identity-services-config: + +Configure OpenStack services to use the highly available OpenStack Identity +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Your OpenStack services now point their OpenStack Identity configuration +to the highly available virtual cluster IP address. + +#. For OpenStack Compute service, (if your OpenStack Identity service + IP address is 10.0.0.11) use the following configuration in the + :file:`api-paste.ini` file: + + .. code-block:: ini + + auth_host = 10.0.0.11 + +#. Create the OpenStack Identity Endpoint with this IP address. + + .. note:: + + If you are using both private and public IP addresses, + create two virtual IP addresses and define the endpoint. For + example: + + .. code-block:: console + + $ openstack endpoint create --region $KEYSTONE_REGION \ + $service-type public http://PUBLIC_VIP:5000/v2.0 + $ openstack endpoint create --region $KEYSTONE_REGION \ + $service-type admin http://10.0.0.11:35357/v2.0 + $ openstack endpoint create --region $KEYSTONE_REGION \ + $service-type internal http://10.0.0.11:5000/v2.0 + +#. If you are using Dashboard (horizon), edit the :file:`local_settings.py` + file to include the following: + + .. code-block:: ini + + OPENSTACK_HOST = 10.0.0.11 + + +Telemetry API +------------- + +The Telemetry polling agent can be configured to partition its polling +workload between multiple agents. This enables high availability (HA). + +Both the central and the compute agent can run in an HA deployment. +This means that multiple instances of these services can run in +parallel with workload partitioning among these running instances. + +The `Tooz `_ library provides +the coordination within the groups of service instances. +It provides an API above several back ends that can be used for building +distributed applications. + +Tooz supports +`various drivers `_ +including the following back end solutions: + +* `Zookeeper `_: + Recommended solution by the Tooz project. + +* `Redis `_: + Recommended solution by the Tooz project. + +* `Memcached `_: + Recommended for testing. + +You must configure a supported Tooz driver for the HA deployment of +the Telemetry services. + +For information about the required configuration options +to set in the :file:`ceilometer.conf`, see the `coordination section +`_ +in the OpenStack Configuration Reference. + +.. note:: + + Only one instance for the central and compute agent service(s) is able + to run and function correctly if the ``backend_url`` option is not set. + +The availability check of the instances is provided by heartbeat messages. +When the connection with an instance is lost, the workload will be +reassigned within the remaining instances in the next polling cycle. + +.. note:: + + Memcached uses a timeout value, which should always be set to + a value that is higher than the heartbeat value set for Telemetry. + +For backward compatibility and supporting existing deployments, the central +agent configuration supports using different configuration files. This is for +groups of service instances that are running in parallel. +For enabling this configuration, set a value for the +``partitioning_group_prefix`` option in the +`polling section `_ +in the OpenStack Configuration Reference. + +.. warning:: + + For each sub-group of the central agent pool with the same + ``partitioning_group_prefix``, a disjoint subset of meters must be polled + to avoid samples being missing or duplicated. 
The list of meters to poll + can be set in the :file:`/etc/ceilometer/pipeline.yaml` configuration file. + For more information about pipelines see the `Data processing and pipelines + `_ + section. + +To enable the compute agent to run multiple instances simultaneously with +workload partitioning, the ``workload_partitioning`` option must be set to +``True`` under the `compute section `_ +in the :file:`ceilometer.conf` configuration file. + + +.. To Do: Cover any other projects here with API services which require specific + HA details. diff --git a/doc/ha-guide-draft/source/control-plane.rst b/doc/ha-guide-draft/source/control-plane.rst new file mode 100644 index 0000000000..36ede0826b --- /dev/null +++ b/doc/ha-guide-draft/source/control-plane.rst @@ -0,0 +1,9 @@ +=========================== +Configuring a control plane +=========================== + +.. toctree:: + :maxdepth: 2 + + control-plane-stateless.rst + control-plane-stateful.rst diff --git a/doc/ha-guide-draft/source/figures/Cluster-deployment-collapsed.png b/doc/ha-guide-draft/source/figures/Cluster-deployment-collapsed.png new file mode 100644 index 0000000000..91feec0bb1 Binary files /dev/null and b/doc/ha-guide-draft/source/figures/Cluster-deployment-collapsed.png differ diff --git a/doc/ha-guide-draft/source/figures/Cluster-deployment-segregated.png b/doc/ha-guide-draft/source/figures/Cluster-deployment-segregated.png new file mode 100644 index 0000000000..a504ae18aa Binary files /dev/null and b/doc/ha-guide-draft/source/figures/Cluster-deployment-segregated.png differ diff --git a/doc/ha-guide-draft/source/ha-community.rst b/doc/ha-guide-draft/source/ha-community.rst new file mode 100644 index 0000000000..cba0598b12 --- /dev/null +++ b/doc/ha-guide-draft/source/ha-community.rst @@ -0,0 +1,17 @@ +============ +HA community +============ + +The OpenStack HA community holds `weekly IRC meetings +`_ to discuss +a range of topics relating to HA in OpenStack. Everyone interested is +encouraged to attend. The `logs of all previous meetings +`_ are available to read. + +You can contact the HA community directly in `the #openstack-ha +channel on Freenode IRC `_, or by +sending mail to the `openstack-dev +`_ +or `openstack-docs +`_ +mailing list with the ``[HA]`` prefix in the ``Subject`` header. diff --git a/doc/ha-guide-draft/source/index.rst b/doc/ha-guide-draft/source/index.rst new file mode 100644 index 0000000000..87bb04a894 --- /dev/null +++ b/doc/ha-guide-draft/source/index.rst @@ -0,0 +1,38 @@ +================================= +OpenStack High Availability Guide +================================= + +Abstract +~~~~~~~~ + +This guide describes how to install and configure OpenStack for high +availability. It supplements the Installation Tutorials and Guides +and assumes that you are familiar with the material in those guides. + +This guide documents OpenStack Ocata, Newton, and Mitaka releases. + +.. warning:: + + This guide is a work-in-progress and changing rapidly + while we continue to test and enhance the guidance. There are + open `TODO` items throughout and available on the OpenStack manuals + `bug list `_. + Please help where you are able. + +.. 
toctree:: + :maxdepth: 1 + + common/conventions.rst + overview.rst + intro-ha.rst + intro-os-ha.rst + control-plane.rst + networking-ha.rst + storage-ha.rst + compute-node-ha.rst + monitoring.rst + testing.rst + ref-arch-examples.rst + ha-community.rst + common/app-support.rst + common/glossary.rst diff --git a/doc/ha-guide-draft/source/intro-ha-common-tech.rst b/doc/ha-guide-draft/source/intro-ha-common-tech.rst new file mode 100644 index 0000000000..572804f60a --- /dev/null +++ b/doc/ha-guide-draft/source/intro-ha-common-tech.rst @@ -0,0 +1,98 @@ +======================== +Commonly used technology +======================== + +Hardware +~~~~~~~~ +The following are the standard hardware requirements: + +- Provider networks: See the *Overview -> Networking Option 1: Provider + networks* section of the + `Install Tutorials and Guides `_ + depending on your distribution. +- Self-service networks: See the *Overview -> Networking Option 2: + Self-service networks* section of the + `Install Tutorials and Guides `_ + depending on your distribution. + +Load balancers +-------------- + +Redundant switches +------------------ + +Bonded interfaces +----------------- + +Storage +------- + +Software +~~~~~~~~ + +HAproxy +------- + +HAProxy provides a fast and reliable HTTP reverse proxy and load balancer +for TCP or HTTP applications. It is particularly suited for web crawling +under very high loads while needing persistence or Layer 7 processing. +It realistically supports tens of thousands of connections with recent +hardware. + +.. note:: + + Ensure your HAProxy installation is not a single point of failure, + it is advisable to have multiple HAProxy instances running. + + You can also ensure the availability by other means, using Keepalived + or Pacemaker. + +Alternatively, you can use a commercial load balancer, which is hardware +or software. We recommend a hardware load balancer as it generally has +good performance. + +For detailed instructions about installing HAProxy on your nodes, +see the HAProxy `official documentation `_. + +keepalived +---------- + +`keepalived `_ is a routing software that +provides facilities for load balancing and high-availability to Linux +system and Linux based infrastructures. + +Keepalived implements a set of checkers to dynamically and +adaptively maintain and manage loadbalanced server pool according +their health. + +The keepalived daemon can be used to monitor services or systems and +to automatically failover to a standby if problems occur. + +Pacemaker +--------- + +`Pacemaker `_ cluster stack is a state-of-the-art +high availability and load balancing stack for the Linux platform. +Pacemaker is used to make OpenStack infrastructure highly available. + +Pacemaker relies on the +`Corosync `_ messaging layer +for reliable cluster communications. Corosync implements the Totem single-ring +ordering and membership protocol. It also provides UDP and InfiniBand based +messaging, quorum, and cluster membership to Pacemaker. + +Pacemaker does not inherently understand the applications it manages. +Instead, it relies on resource agents (RAs) that are scripts that encapsulate +the knowledge of how to start, stop, and check the health of each application +managed by the cluster. + +These agents must conform to one of the `OCF `_, +`SysV Init `_, Upstart, or Systemd standards. 
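For example, a cluster virtual IP address is commonly managed with the
``ocf:heartbeat:IPaddr2`` agent from the standard resource-agents package. The
commands below are a sketch only, assuming an already-running cluster and a
hypothetical VIP of 10.0.0.11:

``pcs``

.. code-block:: console

   $ pcs resource create vip ocf:heartbeat:IPaddr2 \
     ip=10.0.0.11 cidr_netmask=24 op monitor interval=30s

``crmsh``

.. code-block:: console

   $ crm configure primitive vip ocf:heartbeat:IPaddr2 \
     params ip="10.0.0.11" cidr_netmask="24" op monitor interval="30s"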
+ +Pacemaker ships with a large set of OCF agents (such as those managing +MySQL databases, virtual IP addresses, and RabbitMQ), but can also use +any agents already installed on your system and can be extended with +your own (see the +`developer guide `_). diff --git a/doc/ha-guide-draft/source/intro-ha-key-concepts.rst b/doc/ha-guide-draft/source/intro-ha-key-concepts.rst new file mode 100644 index 0000000000..4a75d53b2c --- /dev/null +++ b/doc/ha-guide-draft/source/intro-ha-key-concepts.rst @@ -0,0 +1,147 @@ +============ +Key concepts +============ + +Redundancy and failover +~~~~~~~~~~~~~~~~~~~~~~~ + +High availability is implemented with redundant hardware +running redundant instances of each service. +If one piece of hardware running one instance of a service fails, +the system can then failover to use another instance of a service +that is running on hardware that did not fail. + +A crucial aspect of high availability +is the elimination of single points of failure (SPOFs). +A SPOF is an individual piece of equipment or software +that causes system downtime or data loss if it fails. +In order to eliminate SPOFs, check that mechanisms exist for redundancy of: + +- Network components, such as switches and routers + +- Applications and automatic service migration + +- Storage components + +- Facility services such as power, air conditioning, and fire protection + +In the event that a component fails and a back-up system must take on +its load, most high availability systems will replace the failed +component as quickly as possible to maintain necessary redundancy. This +way time spent in a degraded protection state is minimized. + +Most high availability systems fail in the event of multiple +independent (non-consequential) failures. In this case, most +implementations favor protecting data over maintaining availability. + +High availability systems typically achieve an uptime percentage of +99.99% or more, which roughly equates to less than an hour of +cumulative downtime per year. In order to achieve this, high +availability systems should keep recovery times after a failure to +about one to two minutes, sometimes significantly less. + +OpenStack currently meets such availability requirements for its own +infrastructure services, meaning that an uptime of 99.99% is feasible +for the OpenStack infrastructure proper. However, OpenStack does not +guarantee 99.99% availability for individual guest instances. + +This document discusses some common methods of implementing highly +available systems, with an emphasis on the core OpenStack services and +other open source services that are closely aligned with OpenStack. + +You will need to address high availability concerns for any applications +software that you run on your OpenStack environment. The important thing is +to make sure that your services are redundant and available. +How you achieve that is up to you. + +Active/passive versus active/active +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Stateful services can be configured as active/passive or active/active, +which are defined as follows: + +:term:`active/passive configuration` + Maintains a redundant instance + that can be brought online when the active service fails. + For example, OpenStack writes to the main database + while maintaining a disaster recovery database that can be brought online + if the main database fails. + + A typical active/passive installation for a stateful service maintains + a replacement resource that can be brought online when required. 
+ Requests are handled using a :term:`virtual IP address (VIP)` that + facilitates returning to service with minimal reconfiguration. + A separate application (such as Pacemaker or Corosync) monitors + these services, bringing the backup online as necessary. + +:term:`active/active configuration` + Each service also has a backup but manages both the main and + redundant systems concurrently. + This way, if there is a failure, the user is unlikely to notice. + The backup system is already online and takes on increased load + while the main system is fixed and brought back online. + + Typically, an active/active installation for a stateless service + maintains a redundant instance, and requests are load balanced using + a virtual IP address and a load balancer such as HAProxy. + + A typical active/active installation for a stateful service includes + redundant services, with all instances having an identical state. In + other words, updates to one instance of a database update all other + instances. This way a request to one instance is the same as a + request to any other. A load balancer manages the traffic to these + systems, ensuring that operational systems always handle the + request. + +Clusters and quorums +~~~~~~~~~~~~~~~~~~~~ + +The quorum specifies the minimal number of nodes +that must be functional in a cluster of redundant nodes +in order for the cluster to remain functional. +When one node fails and failover transfers control to other nodes, +the system must ensure that data and processes remain sane. +To determine this, the contents of the remaining nodes are compared +and, if there are discrepancies, a majority rules algorithm is implemented. + +For this reason, each cluster in a high availability environment should +have an odd number of nodes and the quorum is defined as more than a half +of the nodes. +If multiple nodes fail so that the cluster size falls below the quorum +value, the cluster itself fails. + +For example, in a seven-node cluster, the quorum should be set to +``floor(7/2) + 1 == 4``. If quorum is four and four nodes fail simultaneously, +the cluster itself would fail, whereas it would continue to function, if +no more than three nodes fail. If split to partitions of three and four nodes +respectively, the quorum of four nodes would continue to operate the majority +partition and stop or fence the minority one (depending on the +no-quorum-policy cluster configuration). + +And the quorum could also have been set to three, just as a configuration +example. + +.. note:: + + We do not recommend setting the quorum to a value less than ``floor(n/2) + 1`` + as it would likely cause a split-brain in a face of network partitions. + +When four nodes fail simultaneously, the cluster would continue to function as +well. But if split to partitions of three and four nodes respectively, the +quorum of three would have made both sides to attempt to fence the other and +host resources. Without fencing enabled, it would go straight to running +two copies of each resource. + +This is why setting the quorum to a value less than ``floor(n/2) + 1`` is +dangerous. However it may be required for some specific cases, such as a +temporary measure at a point it is known with 100% certainty that the other +nodes are down. + +When configuring an OpenStack environment for study or demonstration purposes, +it is possible to turn off the quorum checking. Production systems should +always run with quorum enabled. + +Load balancing +~~~~~~~~~~~~~~ + +.. 
to do: definition and description of need within HA diff --git a/doc/ha-guide-draft/source/intro-ha.rst b/doc/ha-guide-draft/source/intro-ha.rst new file mode 100644 index 0000000000..d798c46c3c --- /dev/null +++ b/doc/ha-guide-draft/source/intro-ha.rst @@ -0,0 +1,24 @@ +================================= +Introduction to high availability +================================= + +High availability systems seek to minimize the following issues: + +#. System downtime: Occurs when a user-facing service is unavailable + beyond a specified maximum amount of time. + +#. Data loss: Accidental deletion or destruction of data. + +Most high availability systems guarantee protection against system downtime +and data loss only in the event of a single failure. +However, they are also expected to protect against cascading failures, +where a single failure deteriorates into a series of consequential failures. +Many service providers guarantee a :term:`Service Level Agreement (SLA)` +including uptime percentage of computing service, which is calculated based +on the available time and system downtime excluding planned outage time. + +.. toctree:: + :maxdepth: 2 + + intro-ha-key-concepts.rst + intro-ha-common-tech.rst diff --git a/doc/ha-guide-draft/source/intro-os-ha-cluster.rst b/doc/ha-guide-draft/source/intro-os-ha-cluster.rst new file mode 100644 index 0000000000..555ee2631d --- /dev/null +++ b/doc/ha-guide-draft/source/intro-os-ha-cluster.rst @@ -0,0 +1,67 @@ +================ +Cluster managers +================ + +At its core, a cluster is a distributed finite state machine capable +of co-ordinating the startup and recovery of inter-related services +across a set of machines. + +Even a distributed or replicated application that is able to survive failures +on one or more machines can benefit from a cluster manager because a cluster +manager has the following capabilities: + +#. Awareness of other applications in the stack + + While SYS-V init replacements like systemd can provide + deterministic recovery of a complex stack of services, the + recovery is limited to one machine and lacks the context of what + is happening on other machines. This context is crucial to + determine the difference between a local failure, and clean startup + and recovery after a total site failure. + +#. Awareness of instances on other machines + + Services like RabbitMQ and Galera have complicated boot-up + sequences that require co-ordination, and often serialization, of + startup operations across all machines in the cluster. This is + especially true after a site-wide failure or shutdown where you must + first determine the last machine to be active. + +#. A shared implementation and calculation of `quorum + `_ + + It is very important that all members of the system share the same + view of who their peers are and whether or not they are in the + majority. Failure to do this leads very quickly to an internal + `split-brain `_ + state. This is where different parts of the system are pulling in + different and incompatible directions. + +#. Data integrity through fencing (a non-responsive process does not + imply it is not doing anything) + + A single application does not have sufficient context to know the + difference between failure of a machine and failure of the + application on a machine. The usual practice is to assume the + machine is dead and continue working, however this is highly risky. A + rogue process or machine could still be responding to requests and + generally causing havoc. 
The safer approach is to make use of + remotely accessible power switches and/or network switches and SAN + controllers to fence (isolate) the machine before continuing. + +#. Automated recovery of failed instances + + While the application can still run after the failure of several + instances, it may not have sufficient capacity to serve the + required volume of requests. A cluster can automatically recover + failed instances to prevent additional load induced failures. + +Pacemaker +~~~~~~~~~ +.. to do: description and point to ref arch example using pacemaker + +`Pacemaker `_. + +Systemd +~~~~~~~ +.. to do: description and point to ref arch example using Systemd and link diff --git a/doc/ha-guide-draft/source/intro-os-ha-memcached.rst b/doc/ha-guide-draft/source/intro-os-ha-memcached.rst new file mode 100644 index 0000000000..709c891199 --- /dev/null +++ b/doc/ha-guide-draft/source/intro-os-ha-memcached.rst @@ -0,0 +1,35 @@ +========= +Memcached +========= + +Most OpenStack services can use Memcached to store ephemeral data such as +tokens. Although Memcached does not support typical forms of redundancy such +as clustering, OpenStack services can use almost any number of instances +by configuring multiple hostnames or IP addresses. + +The Memcached client implements hashing to balance objects among the instances. +Failure of an instance only impacts a percentage of the objects, +and the client automatically removes it from the list of instances. + +Installation +~~~~~~~~~~~~ + +To install and configure Memcached, read the +`official documentation `_. + +Memory caching is managed by `oslo.cache +`_. +This ensures consistency across all projects when using multiple Memcached +servers. The following is an example configuration with three hosts: + +.. code-block:: ini + + Memcached_servers = controller1:11211,controller2:11211,controller3:11211 + +By default, ``controller1`` handles the caching service. If the host goes down, +``controller2`` or ``controller3`` will complete the service. + +For more information about Memcached installation, see the +*Environment -> Memcached* section in the +`Installation Tutorials and Guides `_ +depending on your distribution. diff --git a/doc/ha-guide-draft/source/intro-os-ha-state.rst b/doc/ha-guide-draft/source/intro-os-ha-state.rst new file mode 100644 index 0000000000..ba7703e844 --- /dev/null +++ b/doc/ha-guide-draft/source/intro-os-ha-state.rst @@ -0,0 +1,52 @@ +================================== +Stateless versus stateful services +================================== + +OpenStack components can be divided into three categories: + +- OpenStack APIs: APIs that are HTTP(s) stateless services written in python, + easy to duplicate and mostly easy to load balance. + +- The SQL relational database server provides stateful type consumed by other + components. Supported databases are MySQL, MariaDB, and PostgreSQL. + Making the SQL database redundant is complex. + +- :term:`Advanced Message Queuing Protocol (AMQP)` provides OpenStack + internal stateful communication service. + +.. to do: Ensure the difference between stateless and stateful services +.. is clear + +Stateless services +~~~~~~~~~~~~~~~~~~ + +A service that provides a response after your request and then +requires no further attention. To make a stateless service highly +available, you need to provide redundant instances and load balance them. 
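+
+As an illustration only, the following minimal HAProxy sketch shows one way
+to load balance a stateless API (here, the Identity service) behind a
+virtual IP address. The ``10.0.0.11`` VIP and the three controller
+addresses are assumptions chosen for this example, not values taken from a
+reference deployment:
+
+.. code-block:: none
+
+   # Minimal sketch: three keystone-api back ends behind one VIP.
+   # Adjust addresses, ports, and health-check intervals to your
+   # environment.
+   listen keystone_api_cluster
+     bind 10.0.0.11:5000
+     balance source
+     option httpchk
+     server controller1 10.0.0.12:5000 check inter 2000 rise 2 fall 5
+     server controller2 10.0.0.13:5000 check inter 2000 rise 2 fall 5
+     server controller3 10.0.0.14:5000 check inter 2000 rise 2 fall 5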
+ +Stateless OpenStack services +---------------------------- + +OpenStack services that are stateless include ``nova-api``, +``nova-conductor``, ``glance-api``, ``keystone-api``, ``neutron-api``, +and ``nova-scheduler``. + +Stateful services +~~~~~~~~~~~~~~~~~ + +A service where subsequent requests to the service +depend on the results of the first request. +Stateful services are more difficult to manage because a single +action typically involves more than one request. Providing +additional instances and load balancing does not solve the problem. +For example, if the horizon user interface reset itself every time +you went to a new page, it would not be very useful. +OpenStack services that are stateful include the OpenStack database +and message queue. +Making stateful services highly available can depend on whether you choose +an active/passive or active/active configuration. + +Stateful OpenStack services +---------------------------- + +.. to do: create list of stateful services diff --git a/doc/ha-guide-draft/source/intro-os-ha.rst b/doc/ha-guide-draft/source/intro-os-ha.rst new file mode 100644 index 0000000000..5613122aed --- /dev/null +++ b/doc/ha-guide-draft/source/intro-os-ha.rst @@ -0,0 +1,12 @@ +================================================ +Introduction to high availability with OpenStack +================================================ + +.. to do: description of section & improvement of title (intro to OS HA) + +.. toctree:: + :maxdepth: 2 + + intro-os-ha-state.rst + intro-os-ha-cluster.rst + intro-os-ha-memcached.rst diff --git a/doc/ha-guide-draft/source/monitoring.rst b/doc/ha-guide-draft/source/monitoring.rst new file mode 100644 index 0000000000..a1b132774f --- /dev/null +++ b/doc/ha-guide-draft/source/monitoring.rst @@ -0,0 +1,6 @@ +========== +Monitoring +========== + + + diff --git a/doc/ha-guide-draft/source/networking-ha-l3-agent.rst b/doc/ha-guide-draft/source/networking-ha-l3-agent.rst new file mode 100644 index 0000000000..5a6370ae1c --- /dev/null +++ b/doc/ha-guide-draft/source/networking-ha-l3-agent.rst @@ -0,0 +1,20 @@ +======== +L3 Agent +======== +.. TODO: Introduce L3 agent + +HA Routers +~~~~~~~~~~ +.. TODO: content for HA routers + +Networking DHCP agent +~~~~~~~~~~~~~~~~~~~~~ +The OpenStack Networking (neutron) service has a scheduler that lets you run +multiple agents across nodes. The DHCP agent can be natively highly available. + +To configure the number of DHCP agents per network, modify the +``dhcp_agents_per_network`` parameter in the :file:`/etc/neutron/neutron.conf` +file. By default this is set to 1. To achieve high availability, assign more +than one DHCP agent per network. For more information, see +`High-availability for DHCP +`_. diff --git a/doc/ha-guide-draft/source/networking-ha-neutron-l3-analysis.rst b/doc/ha-guide-draft/source/networking-ha-neutron-l3-analysis.rst new file mode 100644 index 0000000000..7a803132f7 --- /dev/null +++ b/doc/ha-guide-draft/source/networking-ha-neutron-l3-analysis.rst @@ -0,0 +1,6 @@ +========== +Neutron L3 +========== + +.. TODO: create and import Neutron L3 analysis + Introduce the Networking (neutron) service L3 agent diff --git a/doc/ha-guide-draft/source/networking-ha-neutron-server.rst b/doc/ha-guide-draft/source/networking-ha-neutron-server.rst new file mode 100644 index 0000000000..646eb66aa4 --- /dev/null +++ b/doc/ha-guide-draft/source/networking-ha-neutron-server.rst @@ -0,0 +1,5 @@ +========================= +Neutron Networking server +========================= + +.. 
TODO: Create content similar to other API sections diff --git a/doc/ha-guide-draft/source/networking-ha.rst b/doc/ha-guide-draft/source/networking-ha.rst new file mode 100644 index 0000000000..0767455056 --- /dev/null +++ b/doc/ha-guide-draft/source/networking-ha.rst @@ -0,0 +1,29 @@ +=================================== +Configuring the networking services +=================================== + +Configure networking on each node. See the basic information about +configuring networking in the Networking service section of the +`Install Tutorials and Guides `_, +depending on your distribution. + +OpenStack network nodes contain: + +- Networking DHCP agent +- Neutron L3 agent +- Networking L2 agent + +.. note:: + + The L2 agent cannot be distributed and highly available. Instead, it + must be installed on each data forwarding node to control the virtual + network driver such as Open vSwitch or Linux Bridge. One L2 agent runs + per node and controls its virtual interfaces. + +.. toctree:: + :maxdepth: 2 + + networking-ha-neutron-server.rst + networking-ha-neutron-l3-analysis.rst + networking-ha-l3-agent.rst + diff --git a/doc/ha-guide-draft/source/overview.rst b/doc/ha-guide-draft/source/overview.rst new file mode 100644 index 0000000000..7b64054e21 --- /dev/null +++ b/doc/ha-guide-draft/source/overview.rst @@ -0,0 +1,24 @@ +======== +Overview +======== + +This guide can be split into two parts: + +#. High level architecture +#. Reference architecture examples, monitoring, and testing + +.. warning:: + We recommend using this guide for assistance when considering your HA cloud. + We do not recommend using this guide for manually building your HA cloud. + We recommend starting with a pre-validated solution and adjusting to your + needs. + +High availability is not for every user. It presents some challenges. +High availability may be too complex for databases or +systems with large amounts of data. Replication can slow large systems +down. Different setups have different prerequisites. Read the guidelines +for each setup. + +.. important:: + + High availability is turned off as the default in OpenStack setups. diff --git a/doc/ha-guide-draft/source/ref-arch-examples.rst b/doc/ha-guide-draft/source/ref-arch-examples.rst new file mode 100644 index 0000000000..dc842f3f53 --- /dev/null +++ b/doc/ha-guide-draft/source/ref-arch-examples.rst @@ -0,0 +1,3 @@ +====================== +Reference Architecture +====================== diff --git a/doc/ha-guide-draft/source/storage-ha-backend.rst b/doc/ha-guide-draft/source/storage-ha-backend.rst new file mode 100644 index 0000000000..8148b5287a --- /dev/null +++ b/doc/ha-guide-draft/source/storage-ha-backend.rst @@ -0,0 +1,59 @@ + +.. _storage-ha-backend: + +================ +Storage back end +================ + +An OpenStack environment includes multiple data pools for the VMs: + +- Ephemeral storage is allocated for an instance and is deleted when the + instance is deleted. The Compute service manages ephemeral storage and + by default, Compute stores ephemeral drives as files on local disks on the + compute node. As an alternative, you can use Ceph RBD as the storage back + end for ephemeral storage. + +- Persistent storage exists outside all instances. Two types of persistent + storage are provided: + + - The Block Storage service (cinder) that can use LVM or Ceph RBD as the + storage back end. + - The Image service (glance) that can use the Object Storage service (swift) + or Ceph RBD as the storage back end. 
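+
+As a purely illustrative sketch of what the Ceph RBD option implies for the
+Block Storage service, the following ``cinder.conf`` fragment enables an
+RBD back end. The pool name, Ceph client user, and secret UUID are
+assumptions for this example and must match your own Ceph deployment:
+
+.. code-block:: ini
+
+   [DEFAULT]
+   # Route volume requests to the RBD back end defined below.
+   enabled_backends = ceph
+
+   [ceph]
+   volume_driver = cinder.volume.drivers.rbd.RBDDriver
+   volume_backend_name = ceph
+   # Pool, Ceph client user, and libvirt secret are placeholders.
+   rbd_pool = volumes
+   rbd_ceph_conf = /etc/ceph/ceph.conf
+   rbd_user = cinder
+   rbd_secret_uuid = RBD_SECRET_UUID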
+
+For more information about configuring storage back ends for
+the different storage options, see `Manage volumes
+`_
+in the OpenStack Administrator Guide.
+
+This section discusses ways to protect against data loss in your OpenStack
+environment.
+
+RAID drives
+-----------
+
+Configuring RAID on the hard drives that implement storage protects your
+data against a hard drive failure. If the node itself fails, data may be
+lost. In particular, all volumes stored on an LVM node can be lost.
+
+Ceph
+----
+
+`Ceph RBD `_ is an innately highly available storage back
+end. It creates a storage cluster with multiple nodes that communicate with
+each other to replicate and redistribute data dynamically.
+A Ceph RBD storage cluster provides a single shared set of storage nodes
+that can handle all classes of persistent and ephemeral data (glance,
+cinder, and nova) that are required for OpenStack instances.
+
+Ceph RBD provides object replication capabilities by storing Block Storage
+volumes as Ceph RBD objects. Ceph RBD ensures that each replica of an
+object is stored on a different node. This means that your volumes are
+protected against hard drive and node failures, or even the failure of the
+data center itself.
+
+When Ceph RBD is used for ephemeral volumes as well as block and image
+storage, it supports `live migration
+`_
+of VMs with ephemeral drives. LVM only supports live migration of
+volume-backed VMs.
diff --git a/doc/ha-guide-draft/source/storage-ha-block.rst b/doc/ha-guide-draft/source/storage-ha-block.rst
new file mode 100644
index 0000000000..a9000cbfa5
--- /dev/null
+++ b/doc/ha-guide-draft/source/storage-ha-block.rst
@@ -0,0 +1,190 @@
+==================================
+Highly available Block Storage API
+==================================
+
+Cinder provides Block-Storage-as-a-Service suitable for
+performance-sensitive scenarios such as databases, expandable file
+systems, or providing a server with access to raw block-level storage.
+
+Persistent block storage can survive instance termination and can also
+be moved across instances like any external storage device. Cinder
+also offers a volume snapshot capability for backing up volumes.
+
+Making the Block Storage API service highly available in
+active/passive mode involves:
+
+- :ref:`ha-blockstorage-pacemaker`
+- :ref:`ha-blockstorage-configure`
+- :ref:`ha-blockstorage-services`
+
+In theory, you can run the Block Storage service as active/active.
+However, because of outstanding concerns, we recommend running
+the volume component as active/passive only.
+
+You can read more about these concerns on the
+`Red Hat Bugzilla `_
+and there is a
+`pseudo roadmap `_
+for addressing them upstream.
+
+.. _ha-blockstorage-pacemaker:
+
+Add Block Storage API resource to Pacemaker
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+On RHEL-based systems, create resources for cinder's systemd agents and
+create constraints to enforce startup/shutdown ordering:
+
+..
code-block:: console + + pcs resource create openstack-cinder-api systemd:openstack-cinder-api --clone interleave=true + pcs resource create openstack-cinder-scheduler systemd:openstack-cinder-scheduler --clone interleave=true + pcs resource create openstack-cinder-volume systemd:openstack-cinder-volume + + pcs constraint order start openstack-cinder-api-clone then openstack-cinder-scheduler-clone + pcs constraint colocation add openstack-cinder-scheduler-clone with openstack-cinder-api-clone + pcs constraint order start openstack-cinder-scheduler-clone then openstack-cinder-volume + pcs constraint colocation add openstack-cinder-volume with openstack-cinder-scheduler-clone + + +If the Block Storage service runs on the same nodes as the other services, +then it is advisable to also include: + +.. code-block:: console + + pcs constraint order start openstack-keystone-clone then openstack-cinder-api-clone + +Alternatively, instead of using systemd agents, download and +install the OCF resource agent: + +.. code-block:: console + + # cd /usr/lib/ocf/resource.d/openstack + # wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/cinder-api + # chmod a+rx * + +You can now add the Pacemaker configuration for Block Storage API resource. +Connect to the Pacemaker cluster with the :command:`crm configure` command +and add the following cluster resources: + +.. code-block:: none + + primitive p_cinder-api ocf:openstack:cinder-api \ + params config="/etc/cinder/cinder.conf" \ + os_password="secretsecret" \ + os_username="admin" \ + os_tenant_name="admin" \ + keystone_get_token_url="http://10.0.0.11:5000/v2.0/tokens" \ + op monitor interval="30s" timeout="30s" + +This configuration creates ``p_cinder-api``, a resource for managing the +Block Storage API service. + +The command :command:`crm configure` supports batch input, copy and paste the +lines above into your live Pacemaker configuration and then make changes as +required. For example, you may enter ``edit p_ip_cinder-api`` from the +:command:`crm configure` menu and edit the resource to match your preferred +virtual IP address. + +Once completed, commit your configuration changes by entering :command:`commit` +from the :command:`crm configure` menu. Pacemaker then starts the Block Storage +API service and its dependent resources on one of your nodes. + +.. _ha-blockstorage-configure: + +Configure Block Storage API service +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Edit the ``/etc/cinder/cinder.conf`` file. For example, on a RHEL-based system: + +.. 
code-block:: ini + :linenos: + + [DEFAULT] + # This is the name which we should advertise ourselves as and for + # A/P installations it should be the same everywhere + host = cinder-cluster-1 + + # Listen on the Block Storage VIP + osapi_volume_listen = 10.0.0.11 + + auth_strategy = keystone + control_exchange = cinder + + volume_driver = cinder.volume.drivers.nfs.NfsDriver + nfs_shares_config = /etc/cinder/nfs_exports + nfs_sparsed_volumes = true + nfs_mount_options = v3 + + [database] + sql_connection = mysql://cinder:CINDER_DBPASS@10.0.0.11/cinder + max_retries = -1 + + [keystone_authtoken] + # 10.0.0.11 is the Keystone VIP + identity_uri = http://10.0.0.11:35357/ + auth_uri = http://10.0.0.11:5000/ + admin_tenant_name = service + admin_user = cinder + admin_password = CINDER_PASS + + [oslo_messaging_rabbit] + # Explicitly list the rabbit hosts as it doesn't play well with HAProxy + rabbit_hosts = 10.0.0.12,10.0.0.13,10.0.0.14 + # As a consequence, we also need HA queues + rabbit_ha_queues = True + heartbeat_timeout_threshold = 60 + heartbeat_rate = 2 + +Replace ``CINDER_DBPASS`` with the password you chose for the Block Storage +database. Replace ``CINDER_PASS`` with the password you chose for the +``cinder`` user in the Identity service. + +This example assumes that you are using NFS for the physical storage, which +will almost never be true in a production installation. + +If you are using the Block Storage service OCF agent, some settings will +be filled in for you, resulting in a shorter configuration file: + +.. code-block:: ini + :linenos: + + # We have to use MySQL connection to store data: + sql_connection = mysql://cinder:CINDER_DBPASS@10.0.0.11/cinder + # Alternatively, you can switch to pymysql, + # a new Python 3 compatible library and use + # sql_connection = mysql+pymysql://cinder:CINDER_DBPASS@10.0.0.11/cinder + # and be ready when everything moves to Python 3. + # Ref: https://wiki.openstack.org/wiki/PyMySQL_evaluation + + # We bind Block Storage API to the VIP: + osapi_volume_listen = 10.0.0.11 + + # We send notifications to High Available RabbitMQ: + notifier_strategy = rabbit + rabbit_host = 10.0.0.11 + +Replace ``CINDER_DBPASS`` with the password you chose for the Block Storage +database. + +.. _ha-blockstorage-services: + +Configure OpenStack services to use the highly available Block Storage API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Your OpenStack services must now point their Block Storage API configuration +to the highly available, virtual cluster IP address rather than a Block Storage +API server’s physical IP address as you would for a non-HA environment. + +Create the Block Storage API endpoint with this IP. + +If you are using both private and public IP addresses, create two virtual IPs +and define your endpoint. For example: + +.. 
code-block:: console + + $ openstack endpoint create volume --region $KEYSTONE_REGION \ + --publicurl 'http://PUBLIC_VIP:8776/v1/%(tenant_id)s' \ + --adminurl 'http://10.0.0.11:8776/v1/%(tenant_id)s' \ + --internalurl 'http://10.0.0.11:8776/v1/%(tenant_id)s' + diff --git a/doc/ha-guide-draft/source/storage-ha-file-systems.rst b/doc/ha-guide-draft/source/storage-ha-file-systems.rst new file mode 100644 index 0000000000..5ef3e2e8a5 --- /dev/null +++ b/doc/ha-guide-draft/source/storage-ha-file-systems.rst @@ -0,0 +1,114 @@ +======================================== +Highly available Shared File Systems API +======================================== + +Making the Shared File Systems (manila) API service highly available +in active/passive mode involves: + +- :ref:`ha-sharedfilesystems-configure` +- :ref:`ha-sharedfilesystems-services` +- :ref:`ha-sharedfilesystems-pacemaker` + +.. _ha-sharedfilesystems-configure: + +Configure Shared File Systems API service +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Edit the :file:`/etc/manila/manila.conf` file: + +.. code-block:: ini + :linenos: + + # We have to use MySQL connection to store data: + sql_connection = mysql+pymysql://manila:password@10.0.0.11/manila?charset=utf8 + + # We bind Shared File Systems API to the VIP: + osapi_volume_listen = 10.0.0.11 + + # We send notifications to High Available RabbitMQ: + notifier_strategy = rabbit + rabbit_host = 10.0.0.11 + + +.. _ha-sharedfilesystems-services: + +Configure OpenStack services to use Shared File Systems API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Your OpenStack services must now point their Shared File Systems API +configuration to the highly available, virtual cluster IP address rather than +a Shared File Systems API server’s physical IP address as you would +for a non-HA environment. + +You must create the Shared File Systems API endpoint with this IP. + +If you are using both private and public IP addresses, you should create two +virtual IPs and define your endpoints like this: + +.. code-block:: console + + $ openstack endpoint create --region RegionOne \ + sharev2 public 'http://PUBLIC_VIP:8786/v2/%(tenant_id)s' + + $ openstack endpoint create --region RegionOne \ + sharev2 internal 'http://10.0.0.11:8786/v2/%(tenant_id)s' + + $ openstack endpoint create --region RegionOne \ + sharev2 admin 'http://10.0.0.11:8786/v2/%(tenant_id)s' + +.. _ha-sharedfilesystems-pacemaker: + +Add Shared File Systems API resource to Pacemaker +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +#. Download the resource agent to your system: + + .. code-block:: console + + # cd /usr/lib/ocf/resource.d/openstack + # wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/manila-api + # chmod a+rx * + +#. Add the Pacemaker configuration for the Shared File Systems + API resource. Connect to the Pacemaker cluster with the following + command: + + .. code-block:: console + + # crm configure + + .. note:: + + The :command:`crm configure` supports batch input. Copy and paste + the lines in the next step into your live Pacemaker configuration and then + make changes as required. + + For example, you may enter ``edit p_ip_manila-api`` from the + :command:`crm configure` menu and edit the resource to match your preferred + virtual IP address. + +#. Add the following cluster resources: + + .. 
code-block:: none + + primitive p_manila-api ocf:openstack:manila-api \ + params config="/etc/manila/manila.conf" \ + os_password="secretsecret" \ + os_username="admin" \ + os_tenant_name="admin" \ + keystone_get_token_url="http://10.0.0.11:5000/v2.0/tokens" \ + op monitor interval="30s" timeout="30s" + + This configuration creates ``p_manila-api``, a resource for managing the + Shared File Systems API service. + +#. Commit your configuration changes by entering the following command + from the :command:`crm configure` menu: + + .. code-block:: console + + # commit + +Pacemaker now starts the Shared File Systems API service and its +dependent resources on one of your nodes. + diff --git a/doc/ha-guide-draft/source/storage-ha-image.rst b/doc/ha-guide-draft/source/storage-ha-image.rst new file mode 100644 index 0000000000..362c65c5ac --- /dev/null +++ b/doc/ha-guide-draft/source/storage-ha-image.rst @@ -0,0 +1,141 @@ +========================== +Highly available Image API +========================== + +The OpenStack Image service offers a service for discovering, registering, and +retrieving virtual machine images. To make the OpenStack Image API service +highly available in active/passive mode, you must: + +- :ref:`glance-api-pacemaker` +- :ref:`glance-api-configure` +- :ref:`glance-services` + +Prerequisites +~~~~~~~~~~~~~ + +Before beginning, ensure that you are familiar with the +documentation for installing the OpenStack Image API service. +See the *Image service* section in the +`Installation Tutorials and Guides `_, +depending on your distribution. + +.. _glance-api-pacemaker: + +Add OpenStack Image API resource to Pacemaker +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +#. Download the resource agent to your system: + + .. code-block:: console + + # cd /usr/lib/ocf/resource.d/openstack + # wget https://git.openstack.org/cgit/openstack/openstack-resource-agents/plain/ocf/glance-api + # chmod a+rx * + +#. Add the Pacemaker configuration for the OpenStack Image API resource. + Use the following command to connect to the Pacemaker cluster: + + .. code-block:: console + + crm configure + + .. note:: + + The :command:`crm configure` command supports batch input. Copy and paste + the lines in the next step into your live Pacemaker configuration and + then make changes as required. + + For example, you may enter ``edit p_ip_glance-api`` from the + :command:`crm configure` menu and edit the resource to match your + preferred virtual IP address. + +#. Add the following cluster resources: + + .. code-block:: console + + primitive p_glance-api ocf:openstack:glance-api \ + params config="/etc/glance/glance-api.conf" \ + os_password="secretsecret" \ + os_username="admin" os_tenant_name="admin" \ + os_auth_url="http://10.0.0.11:5000/v2.0/" \ + op monitor interval="30s" timeout="30s" + + This configuration creates ``p_glance-api``, a resource for managing the + OpenStack Image API service. + +#. Commit your configuration changes by entering the following command from + the :command:`crm configure` menu: + + .. code-block:: console + + commit + +Pacemaker then starts the OpenStack Image API service and its dependent +resources on one of your nodes. + +.. _glance-api-configure: + +Configure OpenStack Image service API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Edit the :file:`/etc/glance/glance-api.conf` file +to configure the OpenStack Image service: + +.. 
code-block:: ini + + # We have to use MySQL connection to store data: + sql_connection=mysql://glance:password@10.0.0.11/glance + # Alternatively, you can switch to pymysql, + # a new Python 3 compatible library and use + # sql_connection=mysql+pymysql://glance:password@10.0.0.11/glance + # and be ready when everything moves to Python 3. + # Ref: https://wiki.openstack.org/wiki/PyMySQL_evaluation + + # We bind OpenStack Image API to the VIP: + bind_host = 10.0.0.11 + + # Connect to OpenStack Image registry service: + registry_host = 10.0.0.11 + + # We send notifications to High Available RabbitMQ: + notifier_strategy = rabbit + rabbit_host = 10.0.0.11 + +[TODO: need more discussion of these parameters] + +.. _glance-services: + +Configure OpenStack services to use the highly available OpenStack Image API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Your OpenStack services must now point their OpenStack Image API configuration +to the highly available, virtual cluster IP address instead of pointing to the +physical IP address of an OpenStack Image API server as you would in a non-HA +cluster. + +For example, if your OpenStack Image API service IP address is 10.0.0.11 +(as in the configuration explained here), you would use the following +configuration in your :file:`nova.conf` file: + +.. code-block:: ini + + [glance] + # ... + api_servers = 10.0.0.11 + # ... + + +You must also create the OpenStack Image API endpoint with this IP address. +If you are using both private and public IP addresses, create two virtual IP +addresses and define your endpoint. For example: + +.. code-block:: console + + $ openstack endpoint create --region $KEYSTONE_REGION \ + image public http://PUBLIC_VIP:9292 + + $ openstack endpoint create --region $KEYSTONE_REGION \ + image admin http://10.0.0.11:9292 + + $ openstack endpoint create --region $KEYSTONE_REGION \ + image internal http://10.0.0.11:9292 diff --git a/doc/ha-guide-draft/source/storage-ha.rst b/doc/ha-guide-draft/source/storage-ha.rst new file mode 100644 index 0000000000..22ea30c492 --- /dev/null +++ b/doc/ha-guide-draft/source/storage-ha.rst @@ -0,0 +1,22 @@ +=================== +Configuring storage +=================== + +.. toctree:: + :maxdepth: 2 + + storage-ha-image.rst + storage-ha-block.rst + storage-ha-file-systems.rst + storage-ha-backend.rst + +Making the Block Storage (cinder) API service highly available in +active/active mode involves: + +* Configuring Block Storage to listen on the VIP address + +* Managing the Block Storage API daemon with the Pacemaker cluster manager + +* Configuring OpenStack services to use this IP address + +.. 
To Do: HA without Pacemaker diff --git a/doc/ha-guide-draft/source/testing.rst b/doc/ha-guide-draft/source/testing.rst new file mode 100644 index 0000000000..3cb8110302 --- /dev/null +++ b/doc/ha-guide-draft/source/testing.rst @@ -0,0 +1,6 @@ +======= +Testing +======= + + + diff --git a/tools/build-all-rst.sh b/tools/build-all-rst.sh index beb747ac6e..e4571abdaf 100755 --- a/tools/build-all-rst.sh +++ b/tools/build-all-rst.sh @@ -33,7 +33,7 @@ PDF_TARGETS=( 'arch-design'\ # Note that these guides are only build for master branch for guide in admin-guide arch-design cli-reference contributor-guide \ - ha-guide image-guide ops-guide user-guide; do + ha-guide ha-guide-draft image-guide ops-guide user-guide; do if [[ ${PDF_TARGETS[*]} =~ $guide ]]; then tools/build-rst.sh doc/$guide --build build \ --target $guide $LINKCHECK $PDF_OPTION @@ -46,7 +46,7 @@ done # Draft guides # This includes guides that we publish from stable branches # as versioned like the networking-guide. -for guide in networking-guide config-reference; do +for guide in ha-guide-draft networking-guide config-reference; do if [[ ${PDF_TARGETS[*]} =~ $guide ]]; then tools/build-rst.sh doc/$guide --build build \ --target "draft/$guide" $LINKCHECK $PDF_OPTION diff --git a/www/draft/draft-index.html b/www/draft/draft-index.html index 30071f41f3..cc6d5ac233 100644 --- a/www/draft/draft-index.html +++ b/www/draft/draft-index.html @@ -77,6 +77,7 @@