+++ /dev/null
-==============
- Librados (C)
-==============
-
-.. highlight:: c
-
-`Librados` provides low-level access to the RADOS service. For an
-overview of RADOS, see :doc:`/architecture`.
-
-
-Example: connecting and writing an object
-=========================================
-
-To use `Librados`, you instantiate a :c:type:`rados_t` variable (a cluster handle) and
-call :c:func:`rados_create()` with a pointer to it::
-
- int err;
- rados_t cluster;
-
- err = rados_create(&cluster, NULL);
- if (err < 0) {
- fprintf(stderr, "%s: cannot create a cluster handle: %s\n", argv[0], strerror(-err));
- exit(1);
- }
-
-Then you configure your :c:type:`rados_t` to connect to your cluster,
-either by setting individual values (:c:func:`rados_conf_set()`),
-using a configuration file (:c:func:`rados_conf_read_file()`), using
-command line options (:c:func:`rados_conf_parse_argv()`), or an
-environment variable (:c:func:`rados_conf_parse_env()`)::
-
- err = rados_conf_read_file(cluster, "/path/to/myceph.conf");
- if (err < 0) {
- fprintf(stderr, "%s: cannot read config file: %s\n", argv[0], strerror(-err));
- exit(1);
- }
-
-Once the cluster handle is configured, you can connect to the cluster with :c:func:`rados_connect()`::
-
- err = rados_connect(cluster);
- if (err < 0) {
- fprintf(stderr, "%s: cannot connect to cluster: %s\n", argv[0], strerror(-err));
- exit(1);
- }
-
-Then you open an "IO context", a :c:type:`rados_ioctx_t`, with :c:func:`rados_ioctx_create()`::
-
- rados_ioctx_t io;
- char *poolname = "mypool";
-
- err = rados_ioctx_create(cluster, poolname, &io);
- if (err < 0) {
- fprintf(stderr, "%s: cannot open rados pool %s: %s\n", argv[0], poolname, strerror(-err));
- rados_shutdown(cluster);
- exit(1);
- }
-
-Note that the pool you try to access must exist.
-
-Then you can use the RADOS data manipulation functions, for example
-write into an object called ``greeting`` with
-:c:func:`rados_write_full()`::
-
- err = rados_write_full(io, "greeting", "hello", 5);
- if (err < 0) {
- fprintf(stderr, "%s: cannot write pool %s: %s\n", argv[0], poolname, strerror(-err));
- rados_ioctx_destroy(io);
- rados_shutdown(cluster);
- exit(1);
- }
-
-In the end, you'll want to close your IO context and connection to RADOS with :c:func:`rados_ioctx_destroy()` and :c:func:`rados_shutdown()`::
-
- rados_ioctx_destroy(io);
- rados_shutdown(cluster);
-
-
-Asynchronous IO
-==============
-
-When doing lots of IO, you often don't need to wait for one operation
-to complete before starting the next one. `Librados` provides
-asynchronous versions of several operations:
-
-* :c:func:`rados_aio_write`
-* :c:func:`rados_aio_append`
-* :c:func:`rados_aio_write_full`
-* :c:func:`rados_aio_read`
-
-For each operation, you must first create a
-:c:type:`rados_completion_t` that represents what to do when the
-operation is safe or complete by calling
-:c:func:`rados_aio_create_completion`. If you don't need anything
-special to happen, you can pass NULL::
-
- rados_completion_t comp;
- err = rados_aio_create_completion(NULL, NULL, NULL, &comp);
- if (err < 0) {
- fprintf(stderr, "%s: could not create aio completion: %s\n", argv[0], strerror(-err));
- rados_ioctx_destroy(io);
- rados_shutdown(cluster);
- exit(1);
- }
-
-Now you can call any of the aio operations, and wait for it to
-be in memory or on disk on all replicas::
-
- err = rados_aio_write(io, "foo", comp, "bar", 3, 0);
- if (err < 0) {
- fprintf(stderr, "%s: could not schedule aio write: %s\n", argv[0], strerror(-err));
- rados_aio_release(comp);
- rados_ioctx_destroy(io);
- rados_shutdown(cluster);
- exit(1);
- }
- rados_wait_for_complete(comp); // in memory
- rados_wait_for_safe(comp); // on disk
-
-Finally, we need to free the memory used by the completion with :c:func:`rados_aio_release`::
-
- rados_aio_release(comp);
-
-You can use the callbacks to tell your application when writes are
-durable, or when read buffers are full. For example, if you wanted to
-measure the latency of each operation when appending to several
-objects, you could schedule several writes and store the ack and
-commit time in the corresponding callback, then wait for all of them
-to complete using :c:func:`rados_aio_flush` before analyzing the
-latencies::
-
- typedef struct {
- struct timeval start;
- struct timeval ack_end;
- struct timeval commit_end;
- } req_duration;
-
- void ack_callback(rados_completion_t comp, void *arg) {
- req_duration *dur = (req_duration *) arg;
- gettimeofday(&dur->ack_end, NULL);
- }
-
- void commit_callback(rados_completion_t comp, void *arg) {
- req_duration *dur = (req_duration *) arg;
- gettimeofday(&dur->commit_end, NULL);
- }
-
- int output_append_latency(rados_ioctx_t io, const char *data, size_t len, size_t num_writes) {
- req_duration times[num_writes];
- rados_completion_t comps[num_writes];
- for (size_t i = 0; i < num_writes; ++i) {
- gettimeofday(&times[i].start, NULL);
- int err = rados_aio_create_completion((void*) &times[i], ack_callback, commit_callback, &comps[i]);
- if (err < 0) {
- fprintf(stderr, "Error creating rados completion: %s\n", strerror(-err));
- return err;
- }
- char obj_name[100];
- snprintf(obj_name, sizeof(obj_name), "foo%ld", (unsigned long)i);
- err = rados_aio_append(io, obj_name, comps[i], data, len);
- if (err < 0) {
- fprintf(stderr, "Error from rados_aio_append: %s", strerror(-err));
- return err;
- }
- }
- // wait until all requests finish *and* the callbacks complete
- rados_aio_flush(io);
- // the latencies can now be analyzed
- printf("Request # | Ack latency (s) | Commit latency (s)\n");
- for (size_t i = 0; i < num_writes; ++i) {
- // don't forget to free the completions
- rados_aio_release(comps[i]);
- struct timeval ack_lat, commit_lat;
- timersub(&times[i].ack_end, &times[i].start, &ack_lat);
- timersub(&times[i].commit_end, &times[i].start, &commit_lat);
- printf("%9ld | %8ld.%06ld | %10ld.%06ld\n", (unsigned long) i, ack_lat.tv_sec, ack_lat.tv_usec, commit_lat.tv_sec, commit_lat.tv_usec);
- }
- return 0;
- }
-
-Note that all the :c:type:`rados_completion_t` must be freed with :c:func:`rados_aio_release` to avoid leaking memory.
-
-
-API calls
-=========
-
- .. doxygenfile:: librados.h
+++ /dev/null
-==================
- LibradosPP (C++)
-==================
-
-.. todo:: write me!
+++ /dev/null
-==========================
- Adding/Removing Monitors
-==========================
-
-When you have a cluster up and running, you may add or remove monitors
-from the cluster at runtime.
-
-Adding Monitors
-===============
-
-Ceph monitors are light-weight processes that maintain a master copy of the
-cluster map. You can run a cluster with 1 monitor. We recommend at least 3
-monitors for a production cluster. Ceph monitors use PAXOS to establish
-consensus about the master cluster map, which requires a majority of
-monitors running to establish a quorum for consensus about the cluster map
-(e.g., 1; 3 out of 5; 4 out of 6; etc.).
-
-Since monitors are light-weight, it is possible to run them on the same
-host as an OSD; however, we recommend running them on separate hosts.
-
-.. important:: A *majority* of monitors in your cluster must be able to
- reach each other in order to establish a quorum.
-
-Deploy your Hardware
---------------------
-
-If you are adding a new host when adding a new monitor, see `Hardware
-Recommendations`_ for details on minimum recommendations for monitor hardware.
-To add a monitor host to your cluster, first make sure you have an up-to-date
-version of Linux installed (typically Ubuntu 12.04 precise).
-
-Add your monitor host to a rack in your cluster, connect it to the network
-and ensure that it has network connectivity.
-
-.. _Hardware Recommendations: ../../install/hardware-recommendations
-
-Install the Required Software
------------------------------
-
-For manually deployed clusters, you must install Ceph packages
-manually. See `Installing Debian/Ubuntu Packages`_ for details.
-You should configure SSH to a user with password-less authentication
-and root permissions.
-
-.. _Installing Debian/Ubuntu Packages: ../../install/debian
-
-For clusters deployed with Chef, create a `chef user`_, `configure
-SSH keys`_, `install Ruby`_ and `install the Chef client`_ on your host. See
-`Installing Chef`_ for details.
-
-.. _chef user: ../../install/chef#createuser
-.. _configure SSH keys: ../../install/chef#genkeys
-.. _install the Chef client: ../../install/chef#installchef
-.. _Installing Chef: ../../install/chef
-.. _install Ruby: ../../install/chef#installruby
-
-.. _adding-mon:
-
-Adding a Monitor (Manual)
--------------------------
-
-This procedure creates a ``ceph-mon`` data directory, retrieves the monitor map
-and monitor keyring, and adds a ``ceph-mon`` daemon to your cluster. If
-this results in only two monitor daemons, you may add more monitors by
-repeating this procedure until you have a sufficient number of ``ceph-mon``
-daemons to achieve a quorum.
-
-#. Create the default directory on your new monitor. ::
-
- ssh {new-mon-host}
- sudo mkdir /var/lib/ceph/mon/ceph-{mon-letter}
-
-#. Create a temporary directory ``{tmp}`` to keep the files needed during
- this process. This directory should be different from monitor's default
- directory created in the previous step, and can be removed after all the
- steps are taken. ::
-
- mkdir {tmp}
-
-#. Retrieve the keyring for your monitors, where ``{tmp}`` is the path to
- the retrieved keyring, and ``{key-filename}`` is the name of the file containing
- the retrieved monitor key. ::
-
- ceph auth get mon. -o {tmp}/{key-filename}
-
-#. Retrieve the monitor map, where ``{tmp}`` is the path to
- the retrieved monitor map, and ``{map-filename}`` is the name of the file
- containing the retrieved monitor map. ::
-
- ceph mon getmap -o {tmp}/{map-filename}
-
-#. Prepare the monitor's data directory created in the first step. You must
- specify the path to the monitor map so that you can retrieve the
- information about a quorum of monitors and their ``fsid``. You must also
- specify a path to the monitor keyring::
-
- sudo ceph-mon -i {mon-letter} --mkfs --monmap {tmp}/{map-filename} --keyring {tmp}/{key-filename}
-
-
-#. Add a ``[mon.{letter}]`` entry for your new monitor in your ``ceph.conf`` file. ::
-
- [mon.c]
- host = new-mon-host
- addr = ip-addr:6789
-
-#. Add the new monitor to the list of monitors for your cluster (runtime). This enables
- other nodes to use this monitor during their initial startup. ::
-
- ceph mon add <name> <ip>[:<port>]
-
-#. Start the new monitor and it will automatically join the cluster.
- The daemon needs to know which address to bind to, either via
- ``--public-addr {ip:port}`` or by setting ``mon addr`` in the
- appropriate section of ``ceph.conf``. For example::
-
- ceph-mon -i newname --public-addr {ip:port}
-
-
-Removing Monitors
-=================
-
-When you remove monitors from a cluster, consider that Ceph monitors use
-PAXOS to establish consensus about the master cluster map. You must have
-a sufficient number of monitors to establish a quorum for consensus about
-the cluster map.
-
-Removing a Monitor (Manual)
----------------------------
-
-This procedure removes a ``ceph-mon`` daemon from your cluster. If this
-procedure results in only two monitor daemons, you may add or remove another
-monitor until you have a number of ``ceph-mon`` daemons that can achieve a
-quorum.
-
-#. Stop the monitor. ::
-
- service ceph -a stop mon.{mon-letter}
-
-#. Remove the monitor from the cluster. ::
-
- ceph mon remove {mon-letter}
-
-#. Remove the monitor entry from ``ceph.conf``.
-
-
-Removing Monitors from an Unhealthy Cluster
--------------------------------------------
-
-This procedure removes a ``ceph-mon`` daemon from an unhealthy cluster--i.e.,
-a cluster that has placement groups that are persistently not ``active + clean``.
-
-
-#. Identify a surviving monitor. ::
-
- ceph mon dump
-
-#. Navigate to a surviving monitor's ``monmap`` directory. ::
-
- ssh {mon-host}
- cd /var/lib/ceph/mon/ceph-{mon-letter}/monmap
-
-#. List the directory contents and identify the last committed map.
- Directory contents will show a numeric list of maps. ::
-
- ls
- 1 2 3 4 5 first_committed last_committed last_pn latest
-
-
-#. Identify the most recently committed map. ::
-
- sudo cat last_committed
-
-#. Copy the most recently committed file to a temporary directory. ::
-
- cp /var/lib/ceph/mon/ceph-{mon-letter}/monmap/{last_committed} /tmp/surviving_map
-
-#. Remove the non-surviving monitors. For example, if you have three monitors,
- ``mon.a``, ``mon.b``, and ``mon.c``, where only ``mon.a`` will survive, follow
- the example below::
-
- monmaptool /tmp/surviving_map --rm {mon-letter}
- #for example
- monmaptool /tmp/surviving_map --rm b
- monmaptool /tmp/surviving_map --rm c
-
-#. Stop all monitors. ::
-
- service ceph -a stop mon
-
-#. Inject the surviving map with the removed monitors into the surviving monitors.
- For example, to inject a map into monitor ``mon.a``, follow the example below::
-
- ceph-mon -i {mon-letter} --inject-monmap {map-path}
- #for example
- ceph-mon -i a --inject-monmap /etc/surviving_map
+++ /dev/null
-======================
- Adding/Removing OSDs
-======================
-
-When you have a cluster up and running, you may add OSDs or remove OSDs
-from the cluster at runtime.
-
-Adding OSDs
-===========
-
-When you want to expand a cluster, you may add an OSD at runtime. With Ceph, an
-OSD is generally one Ceph ``ceph-osd`` daemon for one storage disk within a host
-machine. If your host has multiple storage disks, you may map one ``ceph-osd``
-daemon for each disk.
-
-Generally, it's a good idea to check the capacity of your cluster to see if you
-are reaching the upper end of its capacity. As your cluster reaches its ``near
-full`` ratio, you should add one or more OSDs to expand your cluster's capacity.
-
-.. warning:: Do not let your cluster reach its ``full ratio`` before
- adding an OSD. OSD failures that occur after the cluster reaches
- its ``near full`` ratio may cause the cluster to exceed its
- ``full ratio``.
-
-Deploy your Hardware
---------------------
-
-If you are adding a new host when adding a new OSD,
-see `Hardware Recommendations`_ for details on minimum recommendations
-for OSD hardware. To add an OSD host to your cluster, first make sure you have
-an up-to-date version of Linux installed (typically Ubuntu 12.04 precise),
-and you have made some initial preparations for your storage disks.
-See `Filesystem Recommendations`_ for details.
-
-Add your OSD host to a rack in your cluster, connect it to the network
-and ensure that it has network connectivity.
-
-.. _Hardware Recommendations: ../../install/hardware-recommendations
-.. _Filesystem Recommendations: ../../config-cluster/file-system-recommendations
-
-Install the Required Software
------------------------------
-
-For manually deployed clusters, you must install Ceph packages
-manually. See `Installing Debian/Ubuntu Packages`_ for details.
-You should configure SSH to a user with password-less authentication
-and root permissions.
-
-.. _Installing Debian/Ubuntu Packages: ../../install/debian
-
-For clusters deployed with Chef, create a `chef user`_, `configure
-SSH keys`_, `install Ruby`_ and `install the Chef client`_ on your host. See
-`Installing Chef`_ for details.
-
-.. _chef user: ../../install/chef#createuser
-.. _configure SSH keys: ../../install/chef#genkeys
-.. _install the Chef client: ../../install/chef#installchef
-.. _Installing Chef: ../../install/chef
-.. _Install Ruby: ../../install/chef#installruby
-
-Adding an OSD (Manual)
-----------------------
-
-This procedure sets up a ``ceph-osd`` daemon, configures it to use one disk,
-and configures the cluster to distribute data to the OSD. If your host has
-multiple disks, you may add an OSD for each disk by repeating this procedure.
-
-To add an OSD, create a data directory for it, mount a disk to that directory,
-add the OSD to your configuration file, add the OSD to the cluster, and then
-add it to the CRUSH map.
-
-When you add the OSD to the CRUSH map, consider the weight you give to the new
-OSD. Hard disk capacity grows 40% per year, so newer OSD hosts may have larger
-hard disks than older hosts in the cluster (i.e., they may have greater weight).
-
-#. Create the default directory on your new OSD. ::
-
- ssh {new-osd-host}
- sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
-
-
-#. If the OSD is for a disk other than the OS disk, prepare it
- for use with Ceph, and mount it to the directory you just created::
-
- ssh {new-osd-host}
- sudo mkfs -t {fstype} /dev/{disk}
- sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number}
-
-
-#. Navigate to the host where you keep the master copy of the cluster's
- ``ceph.conf`` file. ::
-
- ssh {admin-host}
- cd /etc/ceph
- vim ceph.conf
-
-#. Add the new OSD to your ``ceph.conf`` file.
-
- .. code-block:: ini
-
- [osd.123]
- host = {hostname}
-
-#. From the host where you keep the master copy of the cluster's
- ``ceph.conf`` file, copy the updated ``ceph.conf`` file to your
- new OSD's ``/etc/ceph`` directory and to other hosts in your cluster. ::
-
- ssh {new-osd} sudo tee /etc/ceph/ceph.conf < /etc/ceph/ceph.conf
-
-#. Create the OSD. ::
-
- ceph osd create {osd-num}
- ceph osd create 123 #for example
-
-#. Initialize the OSD data directory. ::
-
- ssh {new-osd-host}
- ceph-osd -i {osd-num} --mkfs --mkkey
-
- The directory must be empty before you can run ``ceph-osd``.
-
-#. Register the OSD authentication key. The value of ``ceph`` for
- ``ceph-{osd-num}`` in the path is the ``$cluster-$id``. If your
- cluster name differs from ``ceph``, use your cluster name instead.::
-
- ceph auth add osd.{osd-num} osd 'allow *' mon 'allow rwx' -i /var/lib/ceph/osd/ceph-{osd-num}/keyring
-
-#. Add the OSD to the CRUSH map so that it can begin receiving data. You may
- also decompile the CRUSH map, add the OSD to the device list, add the host as a
- bucket (if it's not already in the CRUSH map), add the device as an item in the
- host, assign it a weight, recompile it and set it. See `Add/Move an OSD`_ for
- details. ::
-
- ceph osd crush set {id} {name} {weight} [{bucket-type}={bucket-name}, ...]
-
-
-.. topic:: Argonaut (v0.48) Best Practices
-
- To limit impact on user I/O performance, add an OSD to the CRUSH map
- with an initial weight of ``0``. Then, ramp up the CRUSH weight a
- little bit at a time. For example, to ramp by increments of ``0.2``,
- start with::
-
- ceph osd crush reweight {osd-id} .2
-
- and allow migration to complete before reweighting to ``0.4``,
- ``0.6``, and so on until the desired CRUSH weight is reached.
-
- To limit the impact of OSD failures, you can set::
-
- mon osd down out interval = 0
-
- which prevents down OSDs from automatically being marked out, and then
- ramp them down manually with::
-
- ceph osd reweight {osd-num} .8
-
- Again, wait for the cluster to finish migrating data, and then adjust
- the weight further until you reach a weight of 0. Note that this
- problem prevents the cluster from automatically re-replicating data after
- a failure, so please ensure that sufficient monitoring is in place for
- an administrator to intervene promptly.
-
- Note that this practice will no longer be necessary in Bobtail and
- subsequent releases.
-
-
-Adding an OSD (Chef)
---------------------
-
-This procedure configures your OSD using ``chef-client``. If your host has
-multiple disks, you may need to execute the procedure for preparing an OSD disk
-for each data disk on your host.
-
-When you add the OSD to the CRUSH map, consider the weight you give to the new
-OSD. Hard disk capacity grows 40% per year, so newer OSD hosts may have larger
-hard disks than older hosts in the cluster.
-
-#. Execute ``chef-client`` to register it with Chef as a Chef node.
-
-#. Edit the node. See `Configure Nodes`_ for details.
- Change its environment to your Chef environment.
- Add ``"role[ceph-osd]"`` to the run list.
-
-#. Execute `Prepare OSD Disks`_ for each disk.
-
-#. Execute ``chef-client`` to invoke the run list.
-
-#. Add the OSD to the CRUSH map so that it can begin receiving data. You may
- also decompile the CRUSH map edit the file, recompile it and set it. See
- `Add/Move an OSD`_ for details. ::
-
- ceph osd crush set {id} {name} {weight} pool={pool-name} [{bucket-type}={bucket-name}, ...]
-
-
-Starting the OSD
-----------------
-
-After you add an OSD to Ceph, the OSD is in your configuration. However,
-it is not yet running. The OSD is ``down`` and ``out``. You must start
-your new OSD before it can begin receiving data. You may use
-``service ceph`` from your admin host or start the OSD from its host
-machine::
-
- service ceph -a start osd.{osd-num}
- #or alternatively
- ssh {new-osd-host}
- sudo /etc/init.d/ceph start osd.{osd-num}
-
-
-Once you start your OSD, it is ``up``.
-
-Put the OSD ``in`` the Cluster
-------------------------------
-
-After you start your OSD, it is ``up`` and ``out``. You need to put it in to
-the cluster so that Ceph can begin writing data to it. ::
-
- ceph osd in {osd-num}
-
-
-Observe the Data Migration
---------------------------
-
-Once you have added your new OSD to the CRUSH map, Ceph will begin rebalancing
-the server by migrating placement groups to your new OSD. You can observe this
-process with the `ceph`_ tool. ::
-
- ceph -w
-
-You should see the placement group states change from ``active+clean`` to
-``active, some degraded objects``, and finally ``active+clean`` when migration
-completes. (Control-c to exit.)
-
-
-.. _Add/Move an OSD: ../crush-map#addosd
-.. _Configure Nodes: ../../config-cluster/chef#confignodes
-.. _Prepare OSD Disks: ../../config-cluster/chef#prepdisks
-.. _ceph: ../monitoring
-
-
-
-Removing OSDs
-=============
-
-When you want to reduce the size of a cluster or replace hardware, you may
-remove an OSD at runtime. With Ceph, an OSD is generally one Ceph ``ceph-osd``
-daemon for one storage disk within a host machine. If your host has multiple
-storage disks, you may need to remove one ``ceph-osd`` daemon for each disk.
-Generally, it's a good idea to check the capacity of your cluster to see if you
-are reaching the upper end of its capacity. Ensure that when you remove an OSD
-that your cluster is not at its ``near full`` ratio.
-
-.. warning:: Do not let your cluster reach its ``full ratio`` when
- removing an OSD. Removing OSDs could cause the cluster to reach
- or exceed its ``full ratio``.
-
-
-Take the OSD ``out`` of the Cluster
------------------------------------
-
-Before you remove an OSD, it is usually ``up`` and ``in``. You need to take it
-out of the cluster so that Ceph can begin rebalancing and copying its data to
-other OSDs. ::
-
- ceph osd out {osd-num}
-
-
-Observe the Data Migration
---------------------------
-
-Once you have taken your OSD ``out`` of the cluster, Ceph will begin
-rebalancing the cluster by migrating placement groups out of the OSD you
-removed. You can observe this process with the `ceph`_ tool. ::
-
- ceph -w
-
-You should see the placement group states change from ``active+clean`` to
-``active, some degraded objects``, and finally ``active+clean`` when migration
-completes. (Control-c to exit.)
-
-
-Stopping the OSD
-----------------
-
-After you take an OSD out of the cluster, it may still be running.
-That is, the OSD may be ``up`` and ``out``. You must stop
-your OSD before you remove it from the configuration. ::
-
- ssh {new-osd-host}
- sudo /etc/init.d/ceph stop osd.{osd-num}
-
-Once you stop your OSD, it is ``down``.
-
-
-Removing an OSD (Manual)
-------------------------
-
-This procedure removes an OSD from a cluster map, removes its authentication
-key, removes the OSD from the OSD map, and removes the OSD from the
-``ceph.conf`` file. If your host has multiple disks, you may need to remove an
-OSD for each disk by repeating this procedure.
-
-
-#. Remove the OSD from the CRUSH map so that it no longer receives data. You may
- also decompile the CRUSH map, remove the OSD from the device list, remove the
- device as an item in the host bucket or remove the host bucket (if it's in the
- CRUSH map and you intend to remove the host), recompile the map and set it.
- See `Remove an OSD`_ for details. ::
-
- ceph osd crush remove {name}
-
-#. Remove the OSD authentication key. ::
-
- ceph auth del osd.{osd-num}
-
- The value of ``ceph`` for ``ceph-{osd-num}`` in the path is the ``$cluster-$id``.
- If your cluster name differs from ``ceph``, use your cluster name instead.
-
-#. Remove the OSD. ::
-
- ceph osd rm {osd-num}
- #for example
- ceph osd rm 123
-
-#. Navigate to the host where you keep the master copy of the cluster's
- ``ceph.conf`` file. ::
-
- ssh {admin-host}
- cd /etc/ceph
- vim ceph.conf
-
-#. Remove the OSD entry from your ``ceph.conf`` file. ::
-
- [osd.123]
- host = {hostname}
-
-#. From the host where you keep the master copy of the cluster's ``ceph.conf`` file,
- copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of other
- hosts in your cluster. ::
-
- ssh {osd} sudo tee /etc/ceph/ceph.conf < /etc/ceph/ceph.conf
-
-.. _Remove an OSD: ../crush-map#removeosd
+++ /dev/null
-=====================================
- Ceph Authentication & Authorization
-=====================================
-
-Ceph is a distributed storage system where a typical deployment involves a
-relatively small quorum of *monitors*, scores of *metadata servers* (MDSs) and
-many thousands of OSD daemons operating across many hosts/nodes--representing
-the server portion of the Ceph object store. Ceph clients such as CephFS, Ceph
-block device and Ceph Gateway interact with the Ceph object store. All Ceph
-object store clients use the ``librados`` library to interact with the Ceph
-object store. The following diagram illustrates an abstract client/server
-technology stack.
-
-.. ditaa:: +---------------------------------------------------+
- | client |
- +---------------------------------------------------+
- | librados |
- +---------------------------------------------------+
- +---------------+ +---------------+ +---------------+
- | OSDs | | MDSs | | Monitors |
- +---------------+ +---------------+ +---------------+
-
-Users are either individuals or system actors such as applications, which
-use Ceph clients to interact with Ceph server daemons.
-
-.. ditaa:: +-----+
- | {o} |
- | |
- +--+--+ /---------\ /---------\
- | | Ceph | | Ceph |
- ---+---*----->| |<------------->| |
- | uses | Clients | | Servers |
- | \---------/ \---------/
- /--+--\
- | |
- | |
- actor
-
-For additional information, see our `Cephx Guide`_ and `ceph-authtool manpage`_.
-
-.. _Cephx Guide: ../authentication
-.. _ceph-authtool manpage: ../../man/8/ceph-authtool/
-
-Ceph Authentication (cephx)
-===========================
-
-Cryptographic authentication has some computational costs, though they should
-generally be quite low. If the network environment connecting your client and
-server hosts is very safe and you cannot afford authentication, you can use a
-Ceph option to turn it off. **This is not generally recommended**, but should you
-need to do so, details can be found in the `Disable Cephx`_ section.
-
-.. important:: Remember, if you disable authentication, you are at risk of a
- man-in-the-middle attack altering your client/server messages, which could
- lead to disastrous security effects.
-
-A key scalability feature of Ceph is to avoid a centralized interface to the
-Ceph object store, which means that Ceph clients must be able to interact with
-OSDs directly. To protect data, Ceph provides its ``cephx`` authentication
-system, which authenticates users operating Ceph clients. The ``cephx`` protocol
-operates in a manner with behavior similar to `Kerberos`_.
-
-.. _Disable Cephx: ../authentication#disable-cephx
-.. _Kerberos: http://en.wikipedia.org/wiki/Kerberos_(protocol)
-
-A user/actor invokes a Ceph client to contact a monitor. Unlike Kerberos, each
-monitor can authenticate users and distribute keys, so there is no single point
-of failure or bottleneck when using ``cephx``. The monitor returns an
-authentication data structure similar to a Kerberos ticket that contains a
-session key for use in obtaining Ceph services. This session key is itself
-encrypted with the user's permanent secret key, so that only the user can
-request services from the Ceph monitor(s). The client then uses the session key
-to request its desired services from the monitor, and the monitor provides the
-client with a ticket that will authenticate the client to the OSDs that actually
-handle data. Ceph monitors and OSDs share a secret, so the client can use the
-ticket provided by the monitor with any OSD or metadata server in the cluster.
-Like Kerberos, ``cephx`` tickets expire, so an attacker cannot use an expired
-ticket or session key obtained surreptitiously. This form of authentication will
-prevent attackers with access to the communications medium from either creating
-bogus messages under another user's identity or altering another user's
-legitimate messages, as long as the user's secret key is not divulged before it
-expires.
-
-To use ``cephx``, an administrator must set up users first. In the following
-diagram, the ``client.admin`` user invokes ``ceph auth get-or-create-key`` from
-the command line to generate a username and secret key. Ceph's ``auth``
-subsystem generates the username and key, stores a copy with the monitor(s) and
-transmits the user's secret back to the ``client.admin`` user. This means that
-the client and the monitor share a secret key.
-
-.. note:: The ``client.admin`` user must provide the user ID and
- secret key to the user in a secure manner.
-
-.. ditaa:: +---------+ +---------+
- | Client | | Monitor |
- +---------+ +---------+
- | request to |
- | create a user |
- |-------------->|----------+ create user
- | | | and
- |<--------------|<---------+ store key
- | transmit key |
- | |
-
-
-To authenticate with the monitor, the client passes in the user name to the
-monitor, and the monitor generates a session key and encrypts it with the secret
-key associated to the user name. Then, the monitor transmits the encrypted
-ticket back to the client. The client then decrypts the payload with the shared
-secret key to retrieve the session key. The session key identifies the user for
-the current session. The client then requests a ticket on behalf of the user
-signed by the session key. The monitor generates a ticket, encrypts it with the
-user's secret key and transmits it back to the client. The client decrypts the
-ticket and uses it to sign requests to OSDs and metadata servers throughout the
-cluster.
-
-.. ditaa:: +---------+ +---------+
- | Client | | Monitor |
- +---------+ +---------+
- | authenticate |
- |-------------->|----------+ generate and
- | | | encrypt
- |<--------------|<---------+ session key
- | transmit |
- | encrypted |
- | session key |
- | |
- |-----+ decrypt |
- | | session |
- |<----+ key |
- | |
- | req. ticket |
- |-------------->|----------+ generate and
- | | | encrypt
- |<--------------|<---------+ ticket
- | recv. ticket |
- | |
- |-----+ decrypt |
- | | ticket |
- |<----+ |
-
-
-The ``cephx`` protocol authenticates ongoing communications between the client
-machine and the Ceph servers. Each message sent between a client and server,
-subsequent to the initial authentication, is signed using a ticket that the
-monitors, OSDs and metadata servers can verify with their shared secret.
-
-.. ditaa:: +---------+ +---------+ +-------+ +-------+
- | Client | | Monitor | | MDS | | OSD |
- +---------+ +---------+ +-------+ +-------+
- | request to | | |
- | create a user | | |
- |-------------->| mon and | |
- |<--------------| client share | |
- | receive | a secret. | |
- | shared secret | | |
- | |<------------>| |
- | |<-------------+------------>|
- | | mon, mds, | |
- | authenticate | and osd | |
- |-------------->| share | |
- |<--------------| a secret | |
- | session key | | |
- | | | |
- | req. ticket | | |
- |-------------->| | |
- |<--------------| | |
- | recv. ticket | | |
- | | | |
- | make request (CephFS only) | |
- |----------------------------->| |
- |<-----------------------------| |
- | receive response (CephFS only) |
- | |
- | make request |
- |------------------------------------------->|
- |<-------------------------------------------|
- receive response
-
-The protection offered by this authentication is between the Ceph client and the
-Ceph server hosts. The authentication is not extended beyond the Ceph client. If
-the user accesses the Ceph client from a remote host, Ceph authentication is not
-applied to the connection between the user's host and the client host.
-
-
-Ceph Authorization (caps)
-=========================
-
-Ceph uses the term "capabilities" (caps) to describe authorizing an
-authenticated user to exercise the functionality of the monitors, OSDs and
-metadata servers. Capabilities can also restrict access to data within one or
-more pools.
-
-.. note:: Ceph uses the capabilities discussed here for setting up and
- controlling access between various Ceph client and server instances, and
- are relevant regardless of what type of client accesses the Ceph object
- store. CephFS uses a different type of capability for files and directories
- internal to the CephFS filesystem. CephFS filesystem access controls are
- relevant to CephFS, but not block devices or the RESTful gateway.
-
-A Ceph ``client.admin`` user sets a user's capabilities when creating
-the user.
-
-
-``allow``
-
-:Description: Precedes access settings for a daemon. Implies ``rw`` for MDS only.
-:Example: ``ceph-authtool -n client.foo --cap mds 'allow'``
-
-
-``r``
-
-:Description: Gives the user read access. Required with monitors to retrieve the CRUSH map.
-:Example: ``ceph-authtool -n client.foo --cap mon 'allow r'``
-
-
-``w``
-
-:Description: Gives the user write access to objects.
-:Example: ``ceph-authtool -n client.foo --cap osd 'allow w'``
-
-
-``x``
-
-:Description: Gives the user the capability to call class methods (i.e., both read and write).
-:Example: ``ceph-authtool -n client.foo --cap osd 'allow x'``
-
-
-``class-read``
-
-:Description: Gives the user the capability to call class read methods. Subset of ``x``.
-:Example: ``ceph-authtool -n client.foo --cap osd 'allow class-read'``
-
-
-``class-write``
-
-:Description: Gives the user the capability to call class write methods. Subset of ``x``.
-:Example: ``ceph-authtool -n client.foo --cap osd 'allow class-write'``
-
-
-``*``
-
-:Description: Gives the user read, write and execute permissions for a particular daemon/pool, and the ability to execute admin commands.
-:Example: ``ceph-authtool -n client.foo --cap osd 'allow *'``
-
-
-When setting capabilities for a user, Ceph also supports restricting the
-capabilities to a particular pool. This means you can have full access to some
-pools, and restricted (or no) access to other pools for the same user.
-For example::
-
- ceph-authtool -n client.foo --cap osd 'allow rwx' pool=customer-pool
-
-
-
-Cephx Limitations
-=================
-
-The ``cephx`` protocol authenticates Ceph clients and servers to each other. It
-is not intended to handle authentication of human users or application programs
-run on their behalf. If that effect is required to handle your access control
-needs, you must have another mechanism, which is likely to be specific to the
-front end used to access the Ceph object store. This other mechanism has the
-role of ensuring that only acceptable users and programs are able to run on the
-machine that Ceph will permit to access its object store.
-
-The keys used to authenticate Ceph clients and servers are typically stored in
-a plain text file with appropriate permissions in a trusted host.
-
-.. important:: Storing keys in plaintext files has security shortcomings, but
- they are difficult to avoid, given the basic authentication methods Ceph
- uses in the background. Those setting up Ceph systems should be aware of
- these shortcomings.
-
-In particular, arbitrary user machines, especially portable machines, should not
-be configured to interact directly with Ceph, since that mode of use would
-require the storage of a plaintext authentication key on an insecure machine.
-Anyone who stole that machine or obtained surreptitious access to it could
-obtain the key that will allow them to authenticate their own machines to Ceph.
-
-Rather than permitting potentially insecure machines to access a Ceph object
-store directly, users should be required to sign in to a trusted machine in
-your environment using a method that provides sufficient security for your
-purposes. That trusted machine will store the plaintext Ceph keys for the
-human users. A future version of Ceph may address these particular
-authentication issues more fully.
-
-At the moment, none of the Ceph authentication protocols provide secrecy for
-messages in transit. Thus, an eavesdropper on the wire can hear and understand
-all data sent between clients and servers in Ceph, even if he cannot create or
-alter them. Further, Ceph does not include options to encrypt user data in the
-object store. Users can hand-encrypt and store their own data in the Ceph
-object store, of course, but Ceph provides no features to perform object
-encryption itself. Those storing sensitive data in Ceph should consider
-encrypting their data before providing it to the Ceph system.
+++ /dev/null
-=============
- Cephx Guide
-=============
-
-Ceph provides two authentication modes:
-
-- **None:** Any user can access data without authentication.
-- **Cephx**: Ceph requires user authentication in a manner similar to Kerberos.
-
-If you disable ``cephx``, you do not need to generate keys using the procedures
-described here. If you re-enable ``cephx`` and have already generated keys, you
-do not need to generate the keys again.
-
-.. important:: The ``cephx`` protocol does not address data encryption in transport
- (e.g., SSL/TLS) or encryption at rest.
-
-For additional information, see our `Cephx Intro`_ and `ceph-authtool manpage`_.
-
-.. _Cephx Intro: ../auth-intro
-.. _ceph-authtool manpage: ../../man/8/ceph-authtool/
-
-
-Configuring Cephx
-=================
-
-There are several important procedures you must follow to enable the ``cephx``
-protocol for your Ceph cluster and its daemons. First, you must generate a
-secret key for the default ``client.admin`` user so the administrator can
-execute Ceph commands. Second, you must generate a monitor secret key and
-distribute it to all monitors in the cluster. Finally, you can follow the
-remaining steps in `Enabling Cephx`_ to enable authentication.
-
-.. _client-admin-key:
-
-The ``client.admin`` Key
-------------------------
-
-When you first install Ceph, each Ceph command you execute on the command line
-assumes that you are the ``client.admin`` default user. When running Ceph with
-``cephx`` enabled, you need to have a key for the ``client.admin`` user to run
-``ceph`` commands as the administrator.
-
-.. important:: To run Ceph commands on the command line with
- ``cephx`` enabled, you need to create a key for the ``client.admin``
- user, and create a secret file under ``/etc/ceph``.
-
-The following command will generate and register a ``client.admin``
-key on the monitor with admin capabilities and write it to a keyring
-on the local file system. If the key already exists, its current
-value will be returned. ::
-
- sudo ceph auth get-or-create client.admin mds 'allow' osd 'allow *' mon 'allow *' > /etc/ceph/keyring
-
-See `Enabling Cephx`_ step 1 for stepwise details to enable ``cephx``.
-
-
-Monitor Keyrings
-----------------
-
-Ceph requires a keyring for the monitors. Use the `ceph-authtool`_ command to
-generate a secret monitor key and keyring. ::
-
- sudo ceph-authtool {keyring} --create-keyring --gen-key -n mon.
-
-A cluster with multiple monitors must have identical keyrings for all
-monitors. So you must copy the keyring to each monitor host under the
-following directory::
-
- /var/lib/ceph/mon/$cluster-$id
-
-See `Enabling Cephx`_ step 2 and 3 for stepwise details to enable ``cephx``.
-
-.. _ceph-authtool: ../../man/8/ceph-authtool/
-
-
-.. _enable-cephx:
-
-Enabling Cephx
---------------
-
-When ``cephx`` is enabled, Ceph will look for the keyring in the default search
-path, which includes ``/etc/ceph/keyring``. You can override this location by
-adding a ``keyring`` option in the ``[global]`` section of your `Ceph
-configuration`_ file, but this is not recommended.
-
-Execute the following procedures to enable ``cephx`` on a cluster with ``cephx``
-disabled. If you (or your deployment utility) have already generated the keys,
-you may skip the steps related to generating keys.
-
-#. Create a ``client.admin`` key, and save a copy of the key for your client host::
-
- ceph auth get-or-create client.admin mon 'allow *' mds 'allow *' osd 'allow *' -o /etc/ceph/keyring
-
- **Warning:** This will clobber any existing ``/etc/ceph/keyring`` file. Be careful!
-
-#. Generate a secret monitor ``mon.`` key::
-
- ceph-authtool --create --gen-key -n mon. /tmp/monitor-key
-
-#. Copy the mon keyring into a ``keyring`` file in every monitor's ``mon data`` directory::
-
- cp /tmp/monitor-key /var/lib/ceph/mon/ceph-a/keyring
-
-#. Generate a secret key for every OSD, where ``{$id}`` is the OSD number::
-
- ceph auth get-or-create osd.{$id} mon 'allow rwx' osd 'allow *' -o /var/lib/ceph/osd/ceph-{$id}/keyring
-
-#. Generate a secret key for every MDS, where ``{$id}`` is the MDS letter::
-
- ceph auth get-or-create mds.{$id} mon 'allow rwx' osd 'allow *' mds 'allow *' -o /var/lib/ceph/mds/ceph-{$id}/keyring
-
-#. Enable ``cephx`` authentication for versions ``0.51`` and above by setting
- the following options in the ``[global]`` section of your `Ceph configuration`_
- file::
-
- auth cluster required = cephx
- auth service required = cephx
- auth client required = cephx
-
-#. Or, enable ``cephx`` authentication for versions ``0.50`` and below by
- setting the following option in the ``[global]`` section of your `Ceph
- configuration`_ file::
-
- auth supported = cephx
-
-.. deprecated:: 0.51
-
-#. Start or restart the Ceph cluster. ::
-
- sudo service ceph -a start
- sudo service ceph -a restart
-
-.. _disable-cephx:
-
-Disabling Cephx
----------------
-
-The following procedure describes how to disable Cephx. If your cluster
-environment is relatively safe, you can offset the computational expense of
-running authentication by disabling it. **We do not recommend it.** However, it may be
-easier during setup and/or troubleshooting to temporarily disable authentication.
-
-#. Disable ``cephx`` authentication for versions ``0.51`` and above by setting
- the following options in the ``[global]`` section of your `Ceph configuration`_
- file::
-
- auth cluster required = none
- auth service required = none
- auth client required = none
-
-#. Or, disable ``cephx`` authentication for versions ``0.50`` and below
- (deprecated as of version 0.51) by setting the following option in the
- ``[global]`` section of your `Ceph configuration`_ file::
-
- auth supported = none
-
-#. Start or restart the Ceph cluster. ::
-
- sudo service ceph -a start
- sudo service ceph -a restart
-
-
-Daemon Keyrings
----------------
-
-With the exception of the monitors, daemon keyrings are generated in
-the same way that user keyrings are. By default, the daemons store
-their keyrings inside their data directory. The default keyring
-locations, and the capabilities necessary for the daemon to function,
-are shown below.
-
-``ceph-mon``
-
-:Location: ``$mon_data/keyring``
-:Capabilities: N/A
-
-``ceph-osd``
-
-:Location: ``$osd_data/keyring``
-:Capabilities: ``mon 'allow rwx' osd 'allow *'``
-
-``ceph-mds``
-
-:Location: ``$mds_data/keyring``
-:Capabilities: ``mon 'allow rwx' mds 'allow *' osd 'allow *'``
-
-``radosgw``
-
-:Location: ``$rgw_data/keyring``
-:Capabilities: ``mon 'allow r' osd 'allow rwx'``
-
-
-Note that the monitor keyring contains a key but no capabilities, and
-is not part of the cluster ``auth`` database.
-
-The daemon data directory locations default to directories of the form::
-
- /var/lib/ceph/$type/$cluster-$id
-
-For example, ``osd.12`` would be::
-
- /var/lib/ceph/osd/ceph-12
-
-You can override these locations, but it is not recommended.
-
-Cephx Administration
-====================
-
-Cephx uses shared secret keys for authentication, meaning both the client and
-the monitor cluster have a copy of the client's secret key. The authentication
-protocol is such that both parties are able to prove to each other they have a
-copy of the key without actually revealing it. This provides mutual
-authentication, which means the cluster is sure the user possesses the secret
-key, and the user is sure that the cluster has a copy of the secret key.
-
-Default users and pools are suitable for initial testing purposes. For test bed
-and production environments, you should create users and assign pool access to
-the users.
-
-.. _add-a-key:
-
-Add a Key
----------
-
-Keys enable a specific user to access the monitor, metadata server and
-cluster according to capabilities assigned to the key. Capabilities are
-simple strings specifying some access permissions for a given server type.
-Each server type has its own string. All capabilities are simply listed
-in ``{type}`` and ``{capability}`` pairs on the command line::
-
- sudo ceph auth get-or-create-key client.{username} {daemon1} {cap1} {daemon2} {cap2} ...
-
-For example, to create a user ``client.foo`` with access 'rw' for
-daemon type 'osd' and 'r' for daemon type 'mon'::
-
- sudo ceph auth get-or-create-key client.foo osd rw mon r > keyring.foo
-
-.. note:: User names are associated with user types, which include ``client``,
- ``admin``, ``osd``, ``mon``, and ``mds``. In most cases, you will be
- creating keys for ``client`` users.
-
-.. _auth-delete-key:
-
-Delete a Key
-------------
-
-To delete a key for a user or a daemon, use ``ceph auth del``::
-
- ceph auth del {daemon-type}.{ID|username}
-
-Where ``{daemon-type}`` is one of ``client``, ``osd``, ``mon``, or ``mds``,
-and ``{ID|username}`` is the ID of the daemon or the username.
-
-List Keys in your Cluster
--------------------------
-
-To list the keys registered in your cluster::
-
- sudo ceph auth list
-
-
-Cephx Commandline Options
-=========================
-
-When Ceph runs with Cephx enabled, you must specify a user name and a secret key
-on the command line. Alternatively, you may use the ``CEPH_ARGS`` environment
-variable to avoid re-entry of the user name and secret. ::
-
- ceph --id {user-name} --keyring=/path/to/secret [commands]
-
-For example::
-
- ceph --id client.admin --keyring=/etc/ceph/ceph.keyring [commands]
-
-
-Ceph supports the following usage for user name and secret:
-
-``--id`` | ``--user``
-
-:Description: Ceph identifies users with a type and an ID (e.g., ``TYPE.ID`` or
- ``client.admin``, ``client.user1``). The ``--id`` and ``--user``
- options enable you to specify the ID portion of the user
- name (e.g., ``admin``, ``user1``, ``foo``, etc.). You can specify
- the user with the ``--id`` and omit the type. For example,
- to specify user ``client.foo`` enter the following::
-
- ceph --id foo --keyring /path/to/keyring health
- ceph --user foo --keyring /path/to/keyring health
-
-
-``--name``
-
-:Description: Ceph identifies users with a type and an ID (e.g., ``TYPE.ID`` or
- ``client.admin``, ``client.user1``). The ``--name`` and ``-n``
- options enable you to specify the fully qualified user name.
- You must specify the user type (typically ``client``) with the
- user ID. For example::
-
- ceph --name client.foo --keyring /path/to/keyring health
- ceph -n client.foo --keyring /path/to/keyring health
-
-
-
-``--keyring``
-
-:Description: The path to the keyring containing one or more user names and
- secrets. The ``--secret`` option provides the same functionality,
- but it does not work with Ceph RADOS Gateway, which uses
- ``--secret`` for another purpose. You may retrieve a keyring with
- ``ceph auth get-or-create`` and store it locally. This is a
- preferred approach, because you can switch user names without
- switching the keyring path. For example::
-
- sudo rbd map foo --pool rbd myimage --id client.foo --keyring /path/to/keyring
-
-
-``--keyfile``
-
-:Description: The path to the key file containing the secret key for the user
- specified by ``--id``, ``--name``, ``-n``, or ``--user``. You may
- retrieve the key for a specific user with ``ceph auth get`` and
- store it locally. Then, specify the path to the keyfile.
- For example::
-
- sudo rbd map foo --pool rbd myimage --id client.foo --keyfile /path/to/file
-
-
-.. note:: Add the user and secret to the ``CEPH_ARGS`` environment variable so that
- you don’t need to enter them each time. You can override the environment
- variable settings on the command line.
-
-
-Backward Compatibility
-======================
-
-.. versionadded:: Bobtail
-
-In Ceph Argonaut v0.48 and earlier versions, if you enable ``cephx``
-authentication, Ceph only authenticates the initial communication between the
-client and daemon; Ceph does not authenticate the subsequent messages they send
-to each other, which has security implications. In Ceph Bobtail and subsequent
-versions, Ceph authenticates all ongoing messages between the entities using the
-session key set up for that initial authentication.
-
-We identified a backward compatibility issue between Argonaut v0.48 (and prior
-versions) and Bobtail (and subsequent versions). During testing, if you
-attempted to use Argonaut (and earlier) daemons with Bobtail (and later)
-daemons, the Argonaut daemons did not know how to perform ongoing message
-authentication, while the Bobtail versions of the daemons insist on
-authenticating message traffic subsequent to the initial
-request/response--making it impossible for Argonaut (and prior) daemons to
-interoperate with Bobtail (and subsequent) daemons.
-
-We have addressed this potential problem by providing a means for Argonaut (and
-prior) systems to interact with Bobtail (and subsequent) systems. Here's how it
-works: by default, the newer systems will not insist on seeing signatures from
-older systems that do not know how to perform them, but will simply accept such
-messages without authenticating them. This new default behavior provides the
-advantage of allowing two different releases to interact. **We do not recommend
-this as a long term solution**. Allowing newer daemons to forgo ongoing
-authentication has the unfortunate security effect that an attacker with control
-of some of your machines or some access to your network can disable session
-security simply by claiming to be unable to sign messages.
-
-.. note:: Even if you don't actually run any old versions of Ceph,
- the attacker may be able to force some messages to be accepted unsigned in the
- default scenario. While running Cephx with the default scenario, Ceph still
- authenticates the initial communication, but you lose desirable session security.
-
-If you know that you are not running older versions of Ceph, or you are willing
-to accept that old servers and new servers will not be able to interoperate, you
-can eliminate this security risk. If you do so, any Ceph system that is new
-enough to support session authentication and that has Cephx enabled will reject
-unsigned messages. To preclude new servers from interacting with old servers,
-include the following line into the ``[global]`` section of your `Ceph
-configuration`_ file directly below the line that specifies the use of Cephx
-for authentication::
-
- cephx require signatures = true
-
-**We recommend migrating all daemons to the newer versions and enabling the
-foregoing flag** at the nearest practical time so that you may avail yourself
-of the enhanced authentication.
-
-.. _Ceph configuration: ../../config-cluster/ceph-conf
+++ /dev/null
-.. index:: control, commands
-
-==================
- Control Commands
-==================
-
-
-Monitor Commands
-================
-
-Monitor commands are issued using the ceph utility::
-
- ceph [-m monhost] {command}
-
-The command is usually (though not always) of the form::
-
- ceph {subsystem} {command}
-
-
-System Commands
-===============
-
-Execute the following to display the current status of the cluster. ::
-
- ceph -s
- ceph status
-
-Execute the following to display a running summary of the status of the cluster,
-and major events. ::
-
- ceph -w
-
-Execute the following to show the monitor quorum, including which monitors are
-participating and which one is the leader. ::
-
- ceph quorum_status
-
-Execute the following to query the status of a single monitor, including whether
-or not it is in the quorum. ::
-
- ceph [-m monhost] mon_status
-
-
-Authentication Subsystem
-========================
-
-To add a keyring for an OSD, execute the following::
-
- ceph auth add {osd} {--in-file|-i} {path-to-osd-keyring}
-
-To list the cluster's keys and their capabilities, execute the following::
-
- ceph auth list
-
-
-Placement Group Subsystem
-=========================
-
-To display the statistics for all placement groups, execute the following::
-
- ceph -- pg dump [--format {format}]
-
-The valid formats are ``plain`` (default) and ``json``.
-
-To display the statistics for all placement groups stuck in a specified state,
-execute the following::
-
- ceph -- pg dump_stuck inactive|unclean|stale [--format {format}] [-t|--threshold {seconds}]
-
-
-``--format`` may be ``plain`` (default) or ``json``
-
-``--threshold`` defines how many seconds "stuck" is (default: 300)
-
-**Inactive** Placement groups cannot process reads or writes because they are waiting for an OSD
-with the most up-to-date data to come back.
-
-**Unclean** Placement groups contain objects that are not replicated the desired number
-of times. They should be recovering.
-
-**Stale** Placement groups are in an unknown state - the OSDs that host them have not
-reported to the monitor cluster in a while (configured by
-``mon_osd_report_timeout``).
-
-Revert "lost" objects to their prior state, either a previous version
-or delete them if they were just created. ::
-
- ceph pg {pgid} mark_unfound_lost revert
-
-
-OSD Subsystem
-=============
-
-Query osd subsystem status. ::
-
- ceph osd stat
-
-Write a copy of the most recent osd map to a file. See
-`osdmaptool`_. ::
-
- ceph osd getmap -o file
-
-.. _osdmaptool: ../../man/8/osdmaptool
-
-Write a copy of the crush map from the most recent osd map to
-file. ::
-
- ceph osd getcrushmap -o file
-
-The foregoing is functionally equivalent to ::
-
- ceph osd getmap -o /tmp/osdmap
- osdmaptool /tmp/osdmap --export-crush file
-
-Dump the OSD map. Valid formats for ``-f`` are ``plain`` and ``json``. If no
-``--format`` option is given, the OSD map is dumped as plain text. ::
-
- ceph osd dump [--format {format}]
-
-Dump the OSD map as a tree with one line per OSD containing weight
-and state. ::
-
- ceph osd tree [--format {format}]
-
-Find out where a specific object is or would be stored in the system::
-
- ceph osd map <pool-name> <object-name>
-
-Add or move a new item (OSD) with the given id/name/weight at the specified
-location. ::
-
- ceph osd crush set {id} {weight} [{loc1} [{loc2} ...]]
-
-Remove an existing item from the CRUSH map. ::
-
- ceph osd crush remove {id}
-
-Move an existing bucket from one position in the hierarchy to another. ::
-
- ceph osd crush move {id} {loc1} [{loc2} ...]
-
-Set the weight of the item given by ``{name}`` to ``{weight}``. ::
-
- ceph osd crush reweight {name} {weight}
-
-Create a cluster snapshot. ::
-
- ceph osd cluster_snap {name}
-
-Mark an OSD as lost. This may result in permanent data loss. Use with caution. ::
-
- ceph osd lost [--yes-i-really-mean-it]
-
-Create a new OSD. If no ID is given, a new ID is automatically selected
-if possible. ::
-
- ceph osd create [{id}]
-
-Remove the given OSD(s). ::
-
- ceph osd rm [{id}...]
-
-Query the current max_osd parameter in the osd map. ::
-
- ceph osd getmaxosd
-
-Import the given OSD map. Note that this can be a bit dangerous,
-since the OSD map includes dynamic state about which OSDs are current
-on or offline; only do this if you've just modified a (very) recent
-copy of the map. ::
-
- ceph osd setmap -i file
-
-Import the given crush map. ::
-
- ceph osd setcrushmap -i file
-
-Set the ``max_osd`` parameter in the OSD map. This is necessary when
-expanding the storage cluster. ::
-
- ceph osd setmaxosd
-
-Mark OSD ``{osd-num}`` down. ::
-
- ceph osd down {osd-num}
-
-Mark OSD ``{osd-num}`` out of the distribution (i.e. allocated no data). ::
-
- ceph osd out {osd-num}
-
-Mark ``{osd-num}`` in the distribution (i.e. allocated data). ::
-
- ceph osd in {osd-num}
-
-List classes that are loaded in the ceph cluster. ::
-
- ceph class list
-
-Set or clear the pause flags in the OSD map. If set, no IO requests
-will be sent to any OSD. Clearing the flags via unpause results in
-resending pending requests. ::
-
- ceph osd pause
- ceph osd unpause
-
-Set the weight of ``{osd-num}`` to ``{weight}``. Two OSDs with the same weight will receive
-roughly the same number of I/O requests and store approximately the
-same amount of data. ::
-
- ceph osd reweight {osd-num} {weight}
-
-Reweights all the OSDs by reducing the weight of OSDs which are
-heavily overused. By default it will adjust the weights downward on
-OSDs which have 120% of the average utilization, but if you include
-threshold it will use that percentage instead. ::
-
- ceph osd reweight-by-utilization [threshold]
-
-Adds/removes the address to/from the blacklist. When adding an address,
-you can specify how long it should be blacklisted in seconds; otherwise,
-it will default to 1 hour. A blacklisted address is prevented from
-connecting to any OSD. Blacklisting is most often used to prevent a
-lagging metadata server from making bad changes to data on the OSDs.
-
-These commands are mostly only useful for failure testing, as
-blacklists are normally maintained automatically and shouldn't need
-manual intervention. ::
-
- ceph osd blacklist add ADDRESS[:source_port] [TIME]
- ceph osd blacklist rm ADDRESS[:source_port]
-
-Creates/deletes a snapshot of a pool. ::
-
- ceph osd pool mksnap {pool-name} {snap-name}
- ceph osd pool rmsnap {pool-name} {snap-name}
-
-Creates/deletes/renames a storage pool. ::
-
- ceph osd pool create {pool-name} pg_num [pgp_num]
- ceph osd pool delete {pool-name}
- ceph osd pool rename {old-name} {new-name}
-
-Changes a pool setting. ::
-
- ceph osd pool set {pool-name} {field} {value}
-
-Valid fields are:
-
- * ``size``: Sets the number of copies of data in the pool.
- * ``crash_replay_interval``: The number of seconds to allow
- clients to replay acknowledged but uncommitted requests.
- * ``pg_num``: The placement group number.
- * ``pgp_num``: Effective number when calculating pg placement.
- * ``crush_ruleset``: rule number for mapping placement.
-
-Get the value of a pool setting. ::
-
- ceph osd pool get {pool-name} {field}
-
-Valid fields are:
-
- * ``pg_num``: The placement group number.
- * ``pgp_num``: Effective number of placement groups when calculating placement.
- * ``lpg_num``: The number of local placement groups.
- * ``lpgp_num``: The number used for placing the local placement groups.
-
-
-Sends a scrub command to OSD ``{osd-num}``. To send the command to all OSDs, use ``*``. ::
-
- ceph osd scrub {osd-num}
-
-Sends a repair command to osdN. To send the command to all osds, use ``*``. ::
-
- ceph osd repair N
-
-Runs a simple throughput benchmark against osdN, writing ``TOTAL_BYTES``
-in write requests of ``BYTES_PER_WRITE`` each. By default, the test
-writes 1 GB in total in 4-MB increments. ::
-
- ceph osd tell N bench [BYTES_PER_WRITE] [TOTAL_BYTES]
-
-
-MDS Subsystem
-=============
-
-Change configuration parameters on a running mds. ::
-
- ceph mds tell {mds-id} injectargs '--{switch} {value} [--{switch} {value}]'
-
-Example::
-
- ceph mds tell 0 injectargs '--debug_ms 1 --debug_mds 10'
-
-Enables debug messages. ::
-
- ceph mds stat
-
-Displays the status of all metadata servers.
-
-.. todo:: ``ceph mds`` subcommands missing docs: set_max_mds, dump, getmap, stop, setmap
-
-
-Mon Subsystem
-=============
-
-Show monitor stats::
-
- ceph mon stat
-
- 2011-12-14 10:40:59.044395 mon {- [mon,stat]
- 2011-12-14 10:40:59.057111 mon.1 -} 'e3: 5 mons at {a=10.1.2.3:6789/0,b=10.1.2.4:6789/0,c=10.1.2.5:6789/0,d=10.1.2.6:6789/0,e=10.1.2.7:6789/0}, election epoch 16, quorum 0,1,2,3' (0)
-
-The ``quorum`` list at the end lists monitor nodes that are part of the current quorum.
-
-This is also available more directly::
-
- $ ./ceph quorum_status
-
- 2011-12-14 10:44:20.417705 mon {- [quorum_status]
- 2011-12-14 10:44:20.431890 mon.0 -}
-
-.. code-block:: javascript
-
- '{ "election_epoch": 10,
- "quorum": [
- 0,
- 1,
- 2],
- "monmap": { "epoch": 1,
- "fsid": "444b489c-4f16-4b75-83f0-cb8097468898",
- "modified": "2011-12-12 13:28:27.505520",
- "created": "2011-12-12 13:28:27.505520",
- "mons": [
- { "rank": 0,
- "name": "a",
- "addr": "127.0.0.1:6789\/0"},
- { "rank": 1,
- "name": "b",
- "addr": "127.0.0.1:6790\/0"},
- { "rank": 2,
- "name": "c",
- "addr": "127.0.0.1:6791\/0"}]}}' (0)
-
-The above will block until a quorum is reached.
-
-For a status of just the monitor you connect to (use ``-m HOST:PORT``
-to select)::
-
- ceph mon_status
-
-
- 2011-12-14 10:45:30.644414 mon {- [mon_status]
- 2011-12-14 10:45:30.644632 mon.0 -}
-
-.. code-block:: javascript
-
- '{ "name": "a",
- "rank": 0,
- "state": "leader",
- "election_epoch": 10,
- "quorum": [
- 0,
- 1,
- 2],
- "outside_quorum": [],
- "monmap": { "epoch": 1,
- "fsid": "444b489c-4f16-4b75-83f0-cb8097468898",
- "modified": "2011-12-12 13:28:27.505520",
- "created": "2011-12-12 13:28:27.505520",
- "mons": [
- { "rank": 0,
- "name": "a",
- "addr": "127.0.0.1:6789\/0"},
- { "rank": 1,
- "name": "b",
- "addr": "127.0.0.1:6790\/0"},
- { "rank": 2,
- "name": "c",
- "addr": "127.0.0.1:6791\/0"}]}}' (0)
-
-A dump of the monitor state::
-
- ceph mon dump
-
- 2011-12-14 10:43:08.015333 mon {- [mon,dump]
- 2011-12-14 10:43:08.015567 mon.0 -} 'dumped monmap epoch 1' (0)
- epoch 1
- fsid 444b489c-4f16-4b75-83f0-cb8097468898
- last_changed 2011-12-12 13:28:27.505520
- created 2011-12-12 13:28:27.505520
- 0: 127.0.0.1:6789/0 mon.a
- 1: 127.0.0.1:6790/0 mon.b
- 2: 127.0.0.1:6791/0 mon.c
-
+++ /dev/null
-============
- CRUSH Maps
-============
-
-The :abbr:`CRUSH (Controlled Replication Under Scalable Hashing)` algorithm
-determines how to store and retrieve data by computing data storage locations.
-CRUSH empowers Ceph clients to communicate with OSDs directly rather than
-through a centralized server or broker. With an algorithmically determined
-method of storing and retrieving data, Ceph avoids a single point of failure, a
-performance bottleneck, and a physical limit to its scalability.
-
-CRUSH requires a map of your cluster, and uses the CRUSH map to pseudo-randomly
-store and retrieve data in OSDs with a uniform distribution of data across the
-cluster. For a detailed discussion of CRUSH, see
-`CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_
-
-.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: http://ceph.com/papers/weil-crush-sc06.pdf
-
-CRUSH Maps contain a list of :abbr:`OSDs (Object Storage Devices)`, a list of
-'buckets' for aggregating the devices into physical locations, and a list of
-rules that tell CRUSH how it should replicate data in a Ceph cluster's pools. By
-reflecting the underlying physical organization of the installation, CRUSH can
-model—and thereby address—potential sources of correlated device failures.
-Typical sources include physical proximity, a shared power source, and a shared
-network. By encoding this information into the cluster map, CRUSH placement
-policies can separate object replicas across different failure domains while
-still maintaining the desired distribution. For example, to address the
-possibility of concurrent failures, it may be desirable to ensure that data
-replicas are on devices in different shelves, racks, power supplies,
-controllers, and/or physical locations.
-
-When you create a configuration file and deploy Ceph with ``mkcephfs``, Ceph
-generates a default CRUSH map for your configuration. The default CRUSH map is
-fine for your Ceph sandbox environment. However, when you deploy a large-scale
-data cluster, you should give significant consideration to developing a custom
-CRUSH map, because it will help you manage your Ceph cluster, improve
-performance and ensure data safety.
-
-For example, if an OSD goes down, a CRUSH Map can help you locate
-the physical data center, room, row and rack of the host with the failed OSD in
-the event you need to use onsite support or replace hardware.
-
-Similarly, CRUSH may help you identify faults more quickly. For example, if all
-OSDs in a particular rack go down simultaneously, the fault may lie with the
-network switch or the power to the rack rather than with the OSDs themselves.
-
-A custom CRUSH map can also help you identify the physical locations where
-Ceph stores redundant copies of data when the placement group(s) associated
-with a failed host are in a degraded state.
-
-`Inktank`_ provides excellent premium support for developing CRUSH maps.
-
-.. _Inktank: http://www.inktank.com
-
-.. note:: Lines of code in example boxes may extend past the edge of the box.
- Please scroll when reading or copying longer examples.
-
-Editing a CRUSH Map
-===================
-
-To edit an existing CRUSH map:
-
-#. `Get the CRUSH Map`_.
-#. `Decompile`_ the CRUSH Map.
-#. Edit at least one of `Devices`_, `Buckets`_ and `Rules`_.
-#. `Recompile`_ the CRUSH Map.
-#. `Set the CRUSH Map`_.
-
-To activate CRUSH Map rules for a specific pool, identify the common ruleset
-number for those rules and specify that ruleset number for the pool. See `Set
-Pool Values`_ for details.
-
-.. _Get the CRUSH Map: #getcrushmap
-.. _Decompile: #decompilecrushmap
-.. _Devices: #crushmapdevices
-.. _Buckets: #crushmapbuckets
-.. _Rules: #crushmaprules
-.. _Recompile: #compilecrushmap
-.. _Set the CRUSH Map: #setcrushmap
-.. _Set Pool Values: ../pools#setpoolvalues
-
-.. _getcrushmap:
-
-Get a CRUSH Map
----------------
-
-To get the CRUSH Map for your cluster, execute the following::
-
- ceph osd getcrushmap -o {compiled-crushmap-filename}
-
-Ceph will output (-o) a compiled CRUSH Map to the filename you specified. Since
-the CRUSH Map is in a compiled form, you must decompile it first before you can
-edit it.
-
-.. _decompilecrushmap:
-
-Decompile a CRUSH Map
----------------------
-
-To decompile a CRUSH Map, execute the following::
-
- crushtool -d {compiled-crushmap-filename} -o {decompiled-crushmap-filename}
-
-Ceph will decompile (-d) the compiled CRUSH map and output (-o) it to the
-filename you specified.
-
-
-.. _compilecrushmap:
-
-Compile a CRUSH Map
--------------------
-
-To compile a CRUSH Map, execute the following::
-
- crushtool -c {decompiled-crush-map-filename} -o {compiled-crush-map-filename}
-
-Ceph will store a compiled CRUSH map to the filename you specified.
-
-
-.. _setcrushmap:
-
-Set a CRUSH Map
----------------
-
-To set the CRUSH Map for your cluster, execute the following::
-
- ceph osd setcrushmap -i {compiled-crushmap-filename}
-
-Ceph will input the compiled CRUSH Map of the filename you specified as the
-CRUSH Map for the cluster.
-
-
-
-CRUSH Map Parameters
-====================
-
-There are three main sections to a CRUSH Map.
-
-#. Devices consist of any object storage device--i.e., the hard disk
- corresponding to a ``ceph-osd`` daemon.
-#. Buckets consist of a hierarchical aggregation of storage locations
- (e.g., rows, racks, hosts, etc.) and their assigned weights.
-#. Rules consist of the manner of selecting buckets
-
-
-.. _crushmapdevices:
-
-CRUSH Map Devices
------------------
-
-To map placement groups to OSDs, a CRUSH Map requires a list of OSD devices
-(i.e., the name of the OSD daemon). The list of devices appears first in the
-CRUSH Map. ::
-
- #devices
- device {num} {osd.name}
-
-For example::
-
- #devices
- device 0 osd.0
- device 1 osd.1
- device 2 osd.2
- device 3 osd.3
-
-As a general rule, an OSD daemon maps to a single disk or to a RAID.
-
-
-.. _crushmapbuckets:
-
-CRUSH Map Buckets
------------------
-
-CRUSH maps support the notion of 'buckets', which may be thought of as nodes
-that aggregate other buckets into a hierarchy of physical locations, where OSD
-devices are the leaves of the hierarchy. The following table lists the default
-types.
-
-+------+-------------+----------------------------------------------------+
-| Type | Location    | Description                                        |
-+======+=============+====================================================+
-| 0 | OSD | An OSD daemon (e.g., osd.1, osd.2, etc). |
-+------+-------------+----------------------------------------------------+
-| 1 | Host | A host name containing one or more OSDs. |
-+------+-------------+----------------------------------------------------+
-| 2 | Rack | A computer rack. The default is ``unknownrack``. |
-+------+-------------+----------------------------------------------------+
-| 3 | Row | A row in a series of racks. |
-+------+-------------+----------------------------------------------------+
-| 4 | Room | A room containing racks and rows of hosts. |
-+------+-------------+----------------------------------------------------+
-| 5 | Data Center | A physical data center containing rooms. |
-+------+-------------+----------------------------------------------------+
-| 6 | Pool | A data storage pool for storing objects. |
-+------+-------------+----------------------------------------------------+
-
-.. tip:: You can remove these types and create your own bucket types.
-
-Ceph's deployment tools generate a CRUSH map that contains a bucket for each
-host, and a pool named "default," which is useful for the default ``data``,
-``metadata`` and ``rbd`` pools. The remaining bucket types provide a means for
-storing information about the physical location of nodes/buckets, which makes
-cluster administration much easier when OSDs, hosts, or network hardware
-malfunction and the administrator needs access to physical hardware.
-
-.. tip:: The term "bucket" used in the context of CRUSH means a Ceph pool, a
- location, or a piece of physical hardware. It is a different concept from
- the term "bucket" when used in the context of RADOS Gateway APIs.
-
-A bucket has a type, a unique name (string), a unique ID expressed as a negative
-integer, a weight relative to the total capacity/capability of its item(s), the
-bucket algorithm (``straw`` by default), and the hash (``0`` by default, reflecting
-CRUSH Hash ``rjenkins1``). A bucket may have one or more items. The items may
-consist of other buckets or OSDs. Items may have a weight that reflects the
-relative weight of the item.
-
-::
-
- [bucket-type] [bucket-name] {
- id [a unique negative numeric ID]
- weight [the relative capacity/capability of the item(s)]
- alg [the bucket type: uniform | list | tree | straw ]
- hash [the hash type: 0 by default]
- item [item-name] weight [weight]
- }
-
-The following example illustrates how you can use buckets to aggregate a pool and
-physical locations like a datacenter, a room, a rack and a row. ::
-
- host ceph-osd-server-1 {
- id -17
- alg straw
- hash 0
- item osd.0 weight 1.00
- item osd.1 weight 1.00
- }
-
- row rack-1-row-1 {
- id -16
- alg straw
- hash 0
- item ceph-osd-server-1 2.00
- }
-
- rack rack-3 {
- id -15
- alg straw
- hash 0
- item rack-3-row-1 weight 2.00
- item rack-3-row-2 weight 2.00
- item rack-3-row-3 weight 2.00
- item rack-3-row-4 weight 2.00
- item rack-3-row-5 weight 2.00
- }
-
- rack rack-2 {
- id -14
- alg straw
- hash 0
- item rack-2-row-1 weight 2.00
- item rack-2-row-2 weight 2.00
- item rack-2-row-3 weight 2.00
- item rack-2-row-4 weight 2.00
- item rack-2-row-5 weight 2.00
- }
-
- rack rack-1 {
- id -13
- alg straw
- hash 0
- item rack-1-row-1 weight 2.00
- item rack-1-row-2 weight 2.00
- item rack-1-row-3 weight 2.00
- item rack-1-row-4 weight 2.00
- item rack-1-row-5 weight 2.00
- }
-
- room server-room-1 {
- id -12
- alg straw
- hash 0
- item rack-1 weight 10.00
- item rack-2 weight 10.00
- item rack-3 weight 10.00
- }
-
- datacenter dc-1 {
- id -11
- alg straw
- hash 0
- item server-room-1 weight 30.00
- item server-room-2 weight 30.00
- }
-
- pool data {
- id -10
- alg straw
- hash 0
- item dc-1 weight 60.00
- item dc-2 weight 60.00
- }
-
-.. _crushmaprules:
-
-CRUSH Map Rules
----------------
-
-CRUSH maps support the notion of 'CRUSH rules', which are the rules that
-determine data placement for a pool. For large clusters, you will likely create
-many pools where each pool may have its own CRUSH ruleset and rules. The default
-CRUSH map has a rule for each pool, and one ruleset assigned to each of the
-default pools, which include:
-
-- ``data``
-- ``metadata``
-- ``rbd``
-
-.. note:: In most cases, you will not need to modify the default rules. When
- you create a new pool, its default ruleset is ``0``.
-
-A rule takes the following form::
-
- rule [rulename] {
-
- ruleset [ruleset]
- type [type]
- min_size [min-size]
- max_size [max-size]
- step [step]
-
- }
-
-
-``ruleset``
-
-:Description: A means of classifying a rule as belonging to a set of rules. Activated by `setting the ruleset in a pool`_.
-:Purpose: A component of the rule mask.
-:Type: Integer
-:Required: Yes
-:Default: 0
-
-.. _setting the ruleset in a pool: ../pools#setpoolvalues
-
-
-``type``
-
-:Description: Describes a rule for either a hard disk (replicated) or a RAID.
-:Purpose: A component of the rule mask.
-:Type: String
-:Required: Yes
-:Default: ``replicated``
-:Valid Values: Currently only ``replicated``
-
-``min_size``
-
-:Description: If a placement group makes fewer replicas than this number, CRUSH will NOT select this rule.
-:Type: Integer
-:Purpose: A component of the rule mask.
-:Required: Yes
-:Default: ``1``
-
-``max_size``
-
-:Description: If a placement group makes more replicas than this number, CRUSH will NOT select this rule.
-:Type: Integer
-:Purpose: A component of the rule mask.
-:Required: Yes
-:Default: 10
-
-
-``step take {bucket}``
-
-:Description: Takes a bucket name, and begins iterating down the tree.
-:Purpose: A component of the rule.
-:Required: Yes
-:Example: ``step take data``
-
-
-``step choose firstn {num} type {bucket-type}``
-
-:Description: Selects the number of buckets of the given type. Where ``N`` is the number of options available, if ``{num} > 0 && < N``, choose that many buckets; if ``{num} < 0``, it means ``N - {num}``; and, if ``{num} == 0``, choose ``N`` buckets (all available).
-:Purpose: A component of the rule.
-:Prerequisite: Follows ``step take`` or ``step choose``.
-:Example: ``step choose firstn 1 type row``
-
-
-``step emit``
-
-:Description: Outputs the current value and empties the stack. Typically used at the end of a rule, but may also be used to pick from different trees in the same rule.
-:Purpose: A component of the rule.
-:Prerequisite: Follows ``step choose``.
-:Example: ``step emit``
-
-.. important:: To activate one or more rules with a common ruleset number to a pool, set the ruleset number to the pool.
-
-
-.. _addosd:
-
-Add/Move an OSD
-===============
-
-To add or move an OSD in the CRUSH map of a running cluster, execute the
-following::
-
- ceph osd crush set {id} {name} {weight} pool={pool-name} [{bucket-type}={bucket-name}, ...]
-
-Where:
-
-``id``
-
-:Description: The numeric ID of the OSD.
-:Type: Integer
-:Required: Yes
-:Example: ``0``
-
-
-``name``
-
-:Description: The full name of the OSD.
-:Type: String
-:Required: Yes
-:Example: ``osd.0``
-
-
-``weight``
-
-:Description: The CRUSH weight for the OSD.
-:Type: Double
-:Required: Yes
-:Example: ``2.0``
-
-
-``pool``
-
-:Description: By default, the CRUSH hierarchy contains the pool name at its root.
-:Type: Key/value pair.
-:Required: Yes
-:Example: ``pool=data``
-
-
-``bucket-type``
-
-:Description: You may specify the OSD's location in the CRUSH hierarchy.
-:Type: Key/value pairs.
-:Required: No
-:Example: ``datacenter=dc1, room=room1, row=foo, rack=bar, host=foo-bar-1``
-
-
-The following example adds ``osd.0`` to the hierarchy, or moves the OSD from a
-previous location. ::
-
- ceph osd crush set 0 osd.0 1.0 pool=data datacenter=dc1, room=room1, row=foo, rack=bar, host=foo-bar-1
-
-
-Adjust an OSD's CRUSH Weight
-============================
-
-To adjust an OSD's crush weight in the CRUSH map of a running cluster, execute
-the following::
-
- ceph osd crush reweight {name} {weight}
-
-Where:
-
-``name``
-
-:Description: The full name of the OSD.
-:Type: String
-:Required: Yes
-:Example: ``osd.0``
-
-
-``weight``
-
-:Description: The CRUSH weight for the OSD.
-:Type: Double
-:Required: Yes
-:Example: ``2.0``
-
-
-.. _removeosd:
-
-Remove an OSD
-=============
-
-To remove an OSD from the CRUSH map of a running cluster, execute the following::
-
- ceph osd crush remove {name}
-
-Where:
-
-``name``
-
-:Description: The full name of the OSD.
-:Type: String
-:Required: Yes
-:Example: ``osd.0``
-
-
-Move a Bucket
-=============
-
-To move a bucket to a different location or position in the CRUSH map hierarchy,
-execute the following::
-
- ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...]
-
-Where:
-
-``bucket-name``
-
-:Description: The name of the bucket to move/reposition.
-:Type: String
-:Required: Yes
-:Example: ``foo-bar-1``
-
-``bucket-type``
-
-:Description: You may specify the bucket's location in the CRUSH hierarchy.
-:Type: Key/value pairs.
-:Required: No
-:Example: ``datacenter=dc1, room=room1, row=foo, rack=bar, host=foo-bar-1``
-
-
-Tunables
-========
-
-.. versionadded:: 0.48
-
-There are several magic numbers that were used in the original CRUSH
-implementation that have proven to be poor choices. To support
-the transition away from them, newer versions of CRUSH (starting with
-the v0.48 argonaut series) allow the values to be adjusted or tuned.
-
-Clusters running recent Ceph releases support using the tunable values
-in the CRUSH maps. However, older clients and daemons will not correctly interact
-with clusters using the "tuned" CRUSH maps. To detect this situation,
-there is now a feature bit ``CRUSH_TUNABLES`` (value 0x40000) to
-reflect support for tunables.
-
-If the OSDMap currently used by the ``ceph-mon`` or ``ceph-osd``
-daemon has non-legacy values, it will require the ``CRUSH_TUNABLES``
-feature bit from clients and daemons who connect to it. This means
-that old clients will not be able to connect.
-
-At some future point in time, newly created clusters will have
-improved default values for the tunables. This is a matter of waiting
-until the support has been present in the Linux kernel clients long
-enough to make this a painless transition for most users.
-
-Impact of Legacy Values
------------------------
-
-The legacy values result in several misbehaviors:
-
- * For hierarchies with a small number of devices in the leaf buckets,
- some PGs map to fewer than the desired number of replicas. This
-   commonly happens for hierarchies with "host" nodes with a small
- number (1-3) of OSDs nested beneath each one.
-
- * For large clusters, some small percentages of PGs map to less than
- the desired number of OSDs. This is more prevalent when there are
- several layers of the hierarchy (e.g., row, rack, host, osd).
-
- * When some OSDs are marked out, the data tends to get redistributed
- to nearby OSDs instead of across the entire hierarchy.
-
-Which client versions support tunables
---------------------------------------
-
- * argonaut series, v0.48.1 or later
- * v0.49 or later
- * Linux kernel version v3.5 or later (for the file system and RBD kernel clients)
-
-A few important points
-----------------------
-
- * Adjusting these values will result in the shift of some PGs between
- storage nodes. If the Ceph cluster is already storing a lot of
- data, be prepared for some fraction of the data to move.
- * The ``ceph-osd`` and ``ceph-mon`` daemons will start requiring the
- ``CRUSH_TUNABLES`` feature of new connections as soon as they get
- the updated map. However, already-connected clients are
- effectively grandfathered in, and will misbehave if they do not
- support the new feature.
- * If the CRUSH tunables are set to non-legacy values and then later
-   changed back to the default values, ``ceph-osd`` daemons will not be
- required to support the feature. However, the OSD peering process
- requires examining and understanding old maps. Therefore, you
- should not run old (pre-v0.48) versions of the ``ceph-osd`` daemon
-   if the cluster has previously used non-legacy CRUSH values, even if
- the latest version of the map has been switched back to using the
- legacy defaults.
-
-Tuning CRUSH
-------------
-
-If you can ensure that all clients are running recent code, you can
-adjust the tunables by extracting the CRUSH map, modifying the values,
-and reinjecting it into the cluster.
-
-* Extract the latest CRUSH map::
-
- ceph osd getcrushmap -o /tmp/crush
-
-* Adjust tunables. These values appear to offer the best behavior
- for both large and small clusters we tested with. You will need to
- additionally specify the ``--enable-unsafe-tunables`` argument to
- ``crushtool`` for this to work. Please use this option with
-  extreme care::
-
- crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new
-
-* Reinject modified map::
-
- ceph osd setcrushmap -i /tmp/crush.new
-
-Legacy values
--------------
-
-For reference, the legacy values for the CRUSH tunables can be set
-with::
-
- crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 -o /tmp/crush.legacy
-
-Again, the special ``--enable-unsafe-tunables`` option is required.
-Further, as noted above, be careful running old versions of the
-``ceph-osd`` daemon after reverting to legacy values as the feature
-bit is not perfectly enforced.
-
+++ /dev/null
-=========================
- Data Placement Overview
-=========================
-
-Ceph stores, replicates and rebalances data objects across a RADOS cluster
-dynamically. With many different users storing objects in different pools for
-different purposes on countless OSDs, Ceph operations require some data
-placement planning. The main data placement planning concepts in Ceph include:
-
-- **Pools:** Ceph stores data within pools, which are logical groups for storing
- objects. Pools manage the number of placement groups, the number of replicas,
- and the ruleset for the pool. To store data in a pool, you must have
- an authenticated user with permissions for the pool. Ceph can snapshot pools.
- Future versions of Ceph will support namespaces within pools.
-
-- **Placement Groups:** Ceph maps objects to placement groups (PGs).
- Placement groups (PGs) are shards or fragments of a logical object pool
- that place objects as a group into OSDs. Placement groups reduce the amount
- of per-object metadata when Ceph stores the data in OSDs. A larger number of
- placement groups (e.g., 100 per OSD) leads to better balancing.
-
-- **CRUSH Maps:** CRUSH is a big part of what allows Ceph to scale without
- performance bottlenecks, without limitations to scalability, and without a
- single point of failure. CRUSH maps provide the physical topology of the
- cluster to the CRUSH algorithm to determine where the data for an object
- and its replicas should be stored, and how to do so across failure domains
- for added data safety among other things.
-
-When you initially set up a test cluster, you can use the default values. Once
-you begin planning for a large Ceph cluster, refer to pools, placement groups
-and CRUSH for data placement operations. If you find some aspects challenging,
-`Inktank`_ provides excellent premium support for Ceph.
-
-.. _Inktank: http://www.inktank.com
\ No newline at end of file
+++ /dev/null
-=======================
- Debugging and Logging
-=======================
-
-You may view Ceph log files under ``/var/log/ceph`` (the default location).
-
-Ceph is still on the leading edge, so you may encounter situations that require
-using Ceph's debugging and logging. To activate and configure Ceph's debug
-logging, refer to `Ceph Logging and Debugging`_. For additional logging
-settings, refer to the `Logging and Debugging Config Reference`_.
-
-.. _Ceph Logging and Debugging: ../../config-cluster/ceph-conf#ceph-logging-and-debugging
-.. _Logging and Debugging Config Reference: ../../config-cluster/log-and-debug-ref
-
-You can change the logging settings at runtime so that you don't have to
-stop and restart the cluster. Refer to `Ceph Configuration - Runtime Changes`_
-for additional details.
-
-Debugging may also require you to track down memory and threading issues.
-You can run a single daemon, a type of daemon, or the whole cluster with
-Valgrind. You should only use Valgrind when developing or debugging Ceph.
-Valgrind is computationally expensive, and will slow down your system otherwise.
-Valgrind messages are logged to ``stderr``.
-
-.. _Ceph Configuration - Runtime Changes: ../../config-cluster/ceph-conf#ceph-runtime-config
+++ /dev/null
-====================
- Cluster Operations
-====================
-
-.. raw:: html
-
- <table><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>High-level Operations</h3>
-
-High-level cluster operations consist primarily of starting, stopping, and
-restarting a cluster with the ``ceph`` service; checking the cluster's health;
-and, monitoring an operating cluster.
-
-.. toctree::
-
- operating
- monitoring
- troubleshooting
- debug
-
-.. raw:: html
-
- </td><td><h3>Data Placement</h3>
-
-Once you have your cluster up and running, you may begin working with data
-placement. Ceph supports petabyte-scale data storage clusters, with storage
-pools and placement groups that distribute data across the cluster using Ceph's
-CRUSH algorithm.
-
-.. toctree::
-
- data-placement
- pools
- placement-groups
- crush-map
-
-
-
-.. raw:: html
-
- </td></tr><tr><td><h3>Authentication and Authorization</h3>
-
-Once you have data placement policies in place, you can begin creating users
-and assigning them capabilities, such as the ability to read and write data
-to one or more pools, or the cluster as a whole.
-
-.. toctree::
-
- Cephx Overview <auth-intro>
- authentication
-
-
-
-.. raw:: html
-
- </td><td><h3>Daemon Operations</h3>
-
-Low-level cluster operations consist of starting, stopping, and restarting a
-particular daemon within a cluster; changing the settings of a particular
-daemon or subsystem; and, adding a daemon to the cluster or removing a daemon
-from the cluster. The most common use cases for low-level operations include
-growing or shrinking the Ceph cluster and replacing legacy or failed hardware
-with new hardware.
-
-.. toctree::
-
- add-or-rm-osds
- add-or-rm-mons
- Command Reference <control>
-
-
-.. raw:: html
-
- </td></tr></tbody></table>
-
+++ /dev/null
-======================
- Monitoring a Cluster
-======================
-
-Once you have a running cluster, you may use the ``ceph`` tool to monitor your
-cluster. Monitoring a cluster typically involves checking OSD status, monitor
-status, placement group status and metadata server status.
-
-Interactive Mode
-================
-
-To run the ``ceph`` tool in interactive mode, type ``ceph`` at the command line
-with no arguments. For example::
-
- ceph
- ceph> health
- ceph> status
- ceph> quorum_status
- ceph> mon_status
-
-
-Checking Cluster Health
-=======================
-
-After you start your cluster, and before you start reading and/or
-writing data, check your cluster's health first. You can check on the
-health of your Ceph cluster with the following::
-
- ceph health
-
-If you specified non-default locations for your configuration or keyring,
-you may specify their locations::
-
- ceph -c /path/to/conf -k /path/to/keyring health
-
-Upon starting the Ceph cluster, you will likely encounter a health
-warning such as ``HEALTH_WARN XXX num placement groups stale``. Wait a few moments and check
-it again. When your cluster is ready, ``ceph health`` should return a message
-such as ``HEALTH_OK``. At that point, it is okay to begin using the cluster.
-
-Watching a Cluster
-==================
-
-To watch the cluster's ongoing events, open a new terminal. Then, enter::
-
- ceph -w
-
-Ceph will print each version of the placement group map and their status. For
-example, a tiny Ceph cluster consisting of one monitor, one metadata server and
-two OSDs may print the following::
-
- health HEALTH_OK
- monmap e1: 1 mons at {a=192.168.0.1:6789/0}, election epoch 0, quorum 0 a
- osdmap e13: 2 osds: 2 up, 2 in
- placement groupmap v9713: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
- mdsmap e4: 1/1/1 up {0=a=up:active}
-
- 2012-08-01 11:33:53.831268 mon.0 [INF] placement groupmap v9712: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
- 2012-08-01 11:35:31.904650 mon.0 [INF] placement groupmap v9713: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
- 2012-08-01 11:35:53.903189 mon.0 [INF] placement groupmap v9714: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
- 2012-08-01 11:37:31.865809 mon.0 [INF] placement groupmap v9715: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
-
-
-Checking a Cluster's Status
-===========================
-
-To check a cluster's status, execute the following::
-
- ceph status
-
-Or::
-
- ceph -s
-
-In interactive mode, type ``status`` and press **Enter**. ::
-
- ceph> status
-
-Ceph will print the cluster status. For example, a tiny Ceph cluster consisting
-of one monitor, one metadata server and two OSDs may print the following::
-
- health HEALTH_OK
- monmap e1: 1 mons at {a=192.168.0.1:6789/0}, election epoch 0, quorum 0 a
- osdmap e13: 2 osds: 2 up, 2 in
- placement groupmap v9754: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
- mdsmap e4: 1/1/1 up {0=a=up:active}
-
-
-Checking OSD Status
-===================
-
-An OSD's status is either in the cluster (``in``) or out of the
-cluster (``out``); and, it is either up and running (``up``), or it is down and
-not running (``down``). If an OSD is ``up``, it may be either ``in`` the
-cluster (you can read and write data) or ``out`` of the cluster. If
-it is ``down``, it should also be ``out``. If an OSD is ``down`` and ``in``,
-there is a problem.
-
-.. ditaa:: +----------------+ +----------------+
- | | | |
- | OSD #n In | | OSD #n Up |
- | | | |
- +----------------+ +----------------+
- ^ ^
- | |
- | |
- v v
- +----------------+ +----------------+
- | | | |
- | OSD #n Out | | OSD #n Down |
- | | | |
- +----------------+ +----------------+
-
-You can check OSDs to ensure they are ``up`` and ``in`` by executing::
-
- ceph osd stat
-
-Or::
-
- ceph osd dump
-
-You can also view OSDs according to their position in the CRUSH map. ::
-
- ceph osd tree
-
-Ceph will print out a CRUSH tree with a host, its OSDs, whether they are up
-and their weight. ::
-
- # id weight type name up/down reweight
- -1 3 pool default
- -3 3 rack mainrack
- -2 3 host osd-host
- 0 1 osd.0 up 1
- 1 1 osd.1 up 1
- 2 1 osd.2 up 1
-
-
-Checking Monitor Status
-=======================
-
-If your cluster has multiple monitors (likely), you should check the monitor
-quorum status after you start the cluster before reading and/or writing data. A
-quorum must be present when multiple monitors are running. You should also check
-monitor status periodically to ensure that they are running.
-
-To display the monitor map, execute the following::
-
- ceph mon stat
-
-Or::
-
- ceph mon dump
-
-To check the quorum status for the monitor cluster, execute the following::
-
- ceph quorum_status
-
-Ceph will return the quorum status. For example, a Ceph cluster consisting of
-three monitors may return the following:
-
-.. code-block:: javascript
-
- { "election_epoch": 10,
- "quorum": [
- 0,
- 1,
- 2],
- "monmap": { "epoch": 1,
- "fsid": "444b489c-4f16-4b75-83f0-cb8097468898",
- "modified": "2011-12-12 13:28:27.505520",
- "created": "2011-12-12 13:28:27.505520",
- "mons": [
- { "rank": 0,
- "name": "a",
- "addr": "127.0.0.1:6789\/0"},
- { "rank": 1,
- "name": "b",
- "addr": "127.0.0.1:6790\/0"},
- { "rank": 2,
- "name": "c",
- "addr": "127.0.0.1:6791\/0"}
- ]
- }
- }
-
-Checking MDS Status
-===================
-
-Metadata servers provide metadata services for Ceph FS. Metadata servers have
-two sets of states: ``up | down`` and ``active | inactive``. To ensure your
-metadata servers are ``up`` and ``active``, execute the following::
-
- ceph mds stat
-
-To display details of the metadata cluster, execute the following::
-
- ceph mds dump
-
-
-Checking Placement Group States
-===============================
-
-Placement groups map objects to OSDs. When you monitor your
-placement groups, you will want them to be ``active`` and ``clean``. For other
-PG states, see `Placement Group States`_.
-
-.. _Placement Group States: ../pg-states
+++ /dev/null
-=====================
- Operating a Cluster
-=====================
-
-The ``ceph`` service provides functionality to **start**, **restart**, and
-**stop** your Ceph cluster. Each time you execute ``ceph`` processes, you
-must specify at least one option and one command. You may also specify a daemon
-type or a daemon instance. For most newer Debian/Ubuntu distributions, you may
-use the following syntax::
-
- sudo service ceph [options] [commands] [daemons]
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
- sudo /etc/init.d/ceph [options] [commands] [daemons]
-
-The ``ceph`` service options include:
-
-+-----------------+----------+-------------------------------------------------+
-| Option | Shortcut | Description |
-+=================+==========+=================================================+
-| ``--verbose`` | ``-v`` | Use verbose logging. |
-+-----------------+----------+-------------------------------------------------+
-| ``--valgrind`` | ``N/A`` | (Dev and QA only) Use `Valgrind`_ debugging. |
-+-----------------+----------+-------------------------------------------------+
-| ``--allhosts`` | ``-a``   | Execute on all hosts in ``ceph.conf``.          |
-| | | Otherwise, it only executes on ``localhost``. |
-+-----------------+----------+-------------------------------------------------+
-| ``--restart`` | ``N/A`` | Automatically restart daemon if it core dumps. |
-+-----------------+----------+-------------------------------------------------+
-| ``--norestart`` | ``N/A`` | Don't restart a daemon if it core dumps. |
-+-----------------+----------+-------------------------------------------------+
-| ``--conf`` | ``-c`` | Use an alternate configuration file. |
-+-----------------+----------+-------------------------------------------------+
-
-The ``ceph`` service commands include:
-
-+------------------+------------------------------------------------------------+
-| Command | Description |
-+==================+============================================================+
-| ``start`` | Start the daemon(s). |
-+------------------+------------------------------------------------------------+
-| ``stop`` | Stop the daemon(s). |
-+------------------+------------------------------------------------------------+
-| ``forcestop`` | Force the daemon(s) to stop. Same as ``kill -9`` |
-+------------------+------------------------------------------------------------+
-| ``killall`` | Kill all daemons of a particular type. |
-+------------------+------------------------------------------------------------+
-| ``cleanlogs`` | Cleans out the log directory. |
-+------------------+------------------------------------------------------------+
-| ``cleanalllogs`` | Cleans out **everything** in the log directory. |
-+------------------+------------------------------------------------------------+
-
-For subsystem operations, the ``ceph`` service can target specific daemon types by
-adding a particular daemon type for the ``[daemons]`` option. Daemon types include:
-
-- ``mon``
-- ``osd``
-- ``mds``
-
-The ``ceph`` service's ``[daemons]`` setting may also target a specific instance::
-
- sudo /etc/init.d/ceph -a start osd.0
-
-Where ``osd.0`` is the first OSD in the cluster.
-
-
-Starting a Cluster
-==================
-
-To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
-
-	sudo service ceph [options] [start|restart] [daemonType|daemonID]
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
- sudo /etc/init.d/ceph [options] [start|restart] [daemonType|daemonID]
-
-The following examples illustrate a typical use case::
-
- sudo service ceph -a start
- sudo /etc/init.d/ceph -a start
-
-Once you execute with ``-a``, Ceph should begin operating. You may also specify
-a particular daemon instance to constrain the command to a single instance. For
-example::
-
- sudo /etc/init.d/ceph start osd.0
-
-
-Stopping a Cluster
-==================
-
-To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
-
- sudo service ceph [options] stop [daemonType|daemonID]
-
-For example::
-
-	sudo service ceph -a stop
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
- sudo /etc/init.d/ceph -a stop
-
-Ceph should shut down the operating processes.
-
-
-.. _Valgrind: http://www.valgrind.org/
\ No newline at end of file
+++ /dev/null
-==========================
- Placement Group Concepts
-==========================
-
-When you execute commands like ``ceph -w``, ``ceph osd dump``, and other
-commands related to placement groups, Ceph may return values using some
-of the following terms:
-
-*Peering*
- The process of bringing all of the OSDs that store
- a Placement Group (PG) into agreement about the state
- of all of the objects (and their metadata) in that PG.
- Note that agreeing on the state does not mean that
- they all have the latest contents.
-
-*Acting Set*
- The ordered list of OSDs who are (or were as of some epoch)
- responsible for a particular placement group.
-
-*Up Set*
-  The ordered list of OSDs responsible for a particular placement
- group for a particular epoch according to CRUSH. Normally this
- is the same as the *Acting Set*, except when the *Acting Set* has
- been explicitly overridden via ``pg_temp`` in the OSD Map.
-
-*Current Interval* or *Past Interval*
- A sequence of OSD map epochs during which the *Acting Set* and *Up
- Set* for particular placement group do not change.
-
-*Primary*
- The member (and by convention first) of the *Acting Set*,
-  that is responsible for coordinating peering, and is
- the only OSD that will accept client-initiated
- writes to objects in a placement group.
-
-*Replica*
- A non-primary OSD in the *Acting Set* for a placement group
- (and who has been recognized as such and *activated* by the primary).
-
-*Stray*
- An OSD that is not a member of the current *Acting Set*, but
- has not yet been told that it can delete its copies of a
- particular placement group.
-
-*Recovery*
- Ensuring that copies of all of the objects in a placement group
- are on all of the OSDs in the *Acting Set*. Once *Peering* has
- been performed, the *Primary* can start accepting write operations,
- and *Recovery* can proceed in the background.
-
-*PG Info*
- Basic metadata about the placement group's creation epoch, the version
- for the most recent write to the placement group, *last epoch started*,
- *last epoch clean*, and the beginning of the *current interval*. Any
- inter-OSD communication about placement groups includes the *PG Info*,
- such that any OSD that knows a placement group exists (or once existed)
- also has a lower bound on *last epoch clean* or *last epoch started*.
-
-*PG Log*
- A list of recent updates made to objects in a placement group.
- Note that these logs can be truncated after all OSDs
- in the *Acting Set* have acknowledged up to a certain
- point.
-
-*Missing Set*
- Each OSD notes update log entries and if they imply updates to
- the contents of an object, adds that object to a list of needed
- updates. This list is called the *Missing Set* for that ``<OSD,PG>``.
-
-*Authoritative History*
- A complete, and fully ordered set of operations that, if
- performed, would bring an OSD's copy of a placement group
- up to date.
-
-*Epoch*
- A (monotonically increasing) OSD map version number
-
-*Last Epoch Start*
- The last epoch at which all nodes in the *Acting Set*
- for a particular placement group agreed on an
- *Authoritative History*. At this point, *Peering* is
- deemed to have been successful.
-
-*up_thru*
- Before a *Primary* can successfully complete the *Peering* process,
-  it must inform a monitor that it is alive through the current
- osd map *Epoch* by having the monitor set its *up_thru* in the osd
- map. This helps *Peering* ignore previous *Acting Sets* for which
- *Peering* never completed after certain sequences of failures, such as
- the second interval below:
-
- - *acting set* = [A,B]
- - *acting set* = [A]
- - *acting set* = [] very shortly after (e.g., simultaneous failure, but staggered detection)
- - *acting set* = [B] (B restarts, A does not)
-
-*Last Epoch Clean*
- The last *Epoch* at which all nodes in the *Acting set*
- for a particular placement group were completely
- up to date (both placement group logs and object contents).
- At this point, *recovery* is deemed to have been
- completed.
+++ /dev/null
-========================
- Placement Group States
-========================
-
-When checking a cluster's status (e.g., running ``ceph -w`` or ``ceph -s``),
-Ceph will report on the status of the placement groups. A placement group has
-one or more states. The optimum state for placement groups in the placement group
-map is ``active + clean``.
-
-*Creating*
- Ceph is still creating the placement group.
-
-*Active*
- Ceph will process requests to the placement group.
-
-*Clean*
- Ceph replicated all objects in the placement group the correct number of times.
-
-*Down*
- A replica with necessary data is down, so the placement group is offline.
-
-*Replay*
- The placement group is waiting for clients to replay operations after an OSD crashed.
-
-*Splitting*
-  Ceph is splitting the placement group into multiple placement groups. (functional?)
-
-*Scrubbing*
- Ceph is checking the placement group for inconsistencies.
-
-*Degraded*
- Ceph has not replicated some objects in the placement group the correct number of times yet.
-
-*Inconsistent*
-  Ceph detects inconsistencies in one or more replicas of an object in the placement group
- (e.g. objects are the wrong size, objects are missing from one replica *after* recovery finished, etc.).
-
-*Peering*
- The placement group is undergoing the peering process
-
-*Repair*
- Ceph is checking the placement group and repairing any inconsistencies it finds (if possible).
-
-*Recovering*
- Ceph is migrating/synchronizing objects and their replicas.
-
-*Backfill*
- Ceph is scanning and synchronizing the entire contents of a placement group
- instead of inferring what contents need to be synchronized from the logs of
- recent operations. *Backfill* is a special case of recovery.
-
-*Wait-backfill*
- The placement group is waiting in line to start backfill.
-
-*Incomplete*
- Ceph detects that a placement group is missing a necessary period of history
- from its log. If you see this state, report a bug, and try to start any
- failed OSDs that may contain the needed information.
-
-*Stale*
- The placement group is in an unknown state - the monitors have not received
- an update for it since the placement group mapping changed.
-
-*Remapped*
- The placement group is temporarily mapped to a different set of OSDs from what
- CRUSH specified.
+++ /dev/null
-==================
- Placement Groups
-==================
-
-A Placement Group (PG) aggregates a series of objects into a group, and maps the
-group to a series of OSDs. Tracking object placement and object metadata on a
-per-object basis is computationally expensive--i.e., a system with millions of
-objects cannot realistically track placement on a per-object basis. Placement
-groups address this barrier to performance and scalability. Additionally,
-placement groups reduce the number of processes and the amount of per-object
-metadata Ceph must track when storing and retrieving data.
-
-.. ditaa::
- /-----\ /-----\ /-----\ /-----\ /-----\
- | obj | | obj | | obj | | obj | | obj |
- \-----/ \-----/ \-----/ \-----/ \-----/
- | | | | |
- +--------+--------+ +---+----+
- | |
- v v
- +-----------------------+ +-----------------------+
- | Placement Group #1 | | Placement Group #2 |
- | | | |
- +-----------------------+ +-----------------------+
- | |
- | +-----------------------+---+
- +------+------+-------------+ |
- | | | |
- v v v v
- /----------\ /----------\ /----------\ /----------\
- | | | | | | | |
- | OSD #1 | | OSD #2 | | OSD #3 | | OSD #4 |
- | | | | | | | |
- \----------/ \----------/ \----------/ \----------/
-
-Each placement group requires some amount of system resources:
-
-- **Directly**: Each PG requires some amount of memory and CPU.
-- **Indirectly**: The total number of PGs increases the peering count.
-
-Increasing the number of placement groups reduces the variance in per-OSD load
-across your cluster. We recommend approximately 50-100 placement groups per OSD
-to balance out memory and CPU requirements and per-OSD load. For a single pool
-of objects, you can use the following formula::
-
- (OSDs * 100)
- Total PGs = ------------
- Replicas
-
-When using multiple data pools for storing objects, you need to ensure that you
-balance the number of placement groups per pool with the number of placement
-groups per OSD so that you arrive at a reasonable total number of placement
-groups that provides reasonably low variance per OSD without taxing system
-resources or making the peering process too slow.
-
-.. _setting the number of placement groups:
-
-Set the Number of Placement Groups
-==================================
-
-To set the number of placement groups in a pool, you must specify the
-number of placement groups at the time you create the pool.
-
-See `Create a Pool`_ for details.
-
-.. _Create a Pool: ../pools#createpool
-
-Get the Number of Placement Groups
-==================================
-
-To get the number of placement groups in a pool, execute the following::
-
- ceph osd pool get {pool-name} pg_num
-
-
-Get a Cluster's PG Statistics
-=============================
-
-To get the statistics for the placement groups in your cluster, execute the following::
-
- ceph pg dump [--format {format}]
-
-Valid formats are ``plain`` (default) and ``json``.
-
-
-Get Statistics for Stuck PGs
-============================
-
-To get the statistics for all placement groups stuck in a specified state,
-execute the following::
-
- ceph pg dump_stuck inactive|unclean|stale [--format <format>] [-t|--threshold <seconds>]
-
-**Inactive** Placement groups cannot process reads or writes because they are waiting for an OSD
-with the most up-to-date data to come up and in.
-
-**Unclean** Placement groups contain objects that are not replicated the desired number
-of times. They should be recovering.
-
-**Stale** Placement groups are in an unknown state - the OSDs that host them have not
-reported to the monitor cluster in a while (configured by ``mon_osd_report_timeout``).
-
-Valid formats are ``plain`` (default) and ``json``. The threshold defines the minimum number
-of seconds the placement group is stuck before including it in the returned statistics
-(default 300 seconds).
-
-
-Get a PG Map
-============
-
-To get the placement group map for a particular placement group, execute the following::
-
- ceph pg map {pg-id}
-
-For example::
-
- ceph pg map 1.6c
-
-Ceph will return the placement group map, the placement group, and the OSD status::
-
- osdmap e13 pg 1.6c (1.6c) -> up [1,0] acting [1,0]
-
-
-Get a PG's Statistics
-=====================
-
-To retrieve statistics for a particular placement group, execute the following::
-
- ceph pg {pg-id} query
-
-
-Scrub a Placement Group
-=======================
-
-To scrub a placement group, execute the following::
-
- ceph pg scrub {pg-id}
-
-Ceph checks the primary and any replica nodes, generates a catalog of all objects
-in the placement group and compares them to ensure that no objects are missing
-or mismatched, and their contents are consistent. Assuming the replicas all
-match, a final semantic sweep ensures that all of the snapshot-related object
-metadata is consistent. Errors are reported via logs.
-
-
-Revert Lost
-===========
-
-If the cluster has lost one or more objects, and you have decided to
-abandon the search for the lost data, you must mark the unfound objects
-as ``lost``.
-
-If all possible locations have been queried and objects are still
-lost, you may have to give up on the lost objects. This is
-possible given unusual combinations of failures that allow the cluster
-to learn about writes that were performed before the writes themselves
-are recovered.
-
-Currently the only supported option is "revert", which will either roll back to
-a previous version of the object or (if it was a new object) forget about it
-entirely. To mark the "unfound" objects as "lost", execute the following::
-
- ceph pg {pg-id} mark_unfound_lost revert
-
-.. important:: Use this feature with caution, because it may confuse
- applications that expect the object(s) to exist.
-
-
-.. toctree::
- :hidden:
-
- pg-states
- pg-concepts
+++ /dev/null
-=======
- Pools
-=======
-
-When you first deploy a cluster without creating a pool, Ceph uses the default
-pools for storing data. A pool differs from CRUSH's location-based buckets in
-that a pool doesn't have a single physical location, and a pool provides you
-with some additional functionality, including:
-
-- **Replicas**: You can set the desired number of copies/replicas of an object.
- A typical configuration stores an object and one additional copy
- (i.e., ``size = 2``), but you can determine the number of copies/replicas.
-
-- **Placement Groups**: You can set the number of placement groups for the pool.
- A typical configuration uses approximately 100 placement groups per OSD to
- provide optimal balancing without using up too many computing resources. When
- setting up multiple pools, be careful to ensure you set a reasonable number of
- placement groups for both the pool and the cluster as a whole.
-
-- **CRUSH Rules**: When you store data in a pool, a CRUSH ruleset mapped to the
- pool enables CRUSH to identify a rule for the placement of the primary object
- and object replicas in your cluster. You can create a custom CRUSH rule for your
- pool.
-
-- **Snapshots**: When you create snapshots with ``ceph osd pool mksnap``,
- you effectively take a snapshot of a particular pool.
-
-- **Set Ownership**: You can set a user ID as the owner of a pool.
-
-To organize data into pools, you can list, create, and remove pools.
-You can also view the utilization statistics for each pool.
-
-
-List Pools
-==========
-
-To list your cluster's pools, execute::
-
- ceph osd lspools
-
-The default pools include:
-
-- ``data``
-- ``metadata``
-- ``rbd``
-
-
-.. _createpool:
-
-Create a Pool
-=============
-
-To create a pool, execute::
-
- ceph osd pool create {pool-name} {pg-num} [{pgp-num}]
-
-Where:
-
-``{pool-name}``
-
-:Description: The name of the pool. It must be unique.
-:Type: String
-:Required: Yes
-
-``{pg-num}``
-
-:Description: The total number of placement groups for the pool
-:Type: Integer
-:Required: No
-
-``{pgp-num}``
-
-:Description: The total number of placement groups for placement purposes.
-:Type: Integer
-:Required: No
-
-When you create a pool, you should consider setting the number of
-placement groups.
-
-.. important:: You cannot change the number of placement groups in a pool
- after you create it.
-
-See `Placement Groups`_ for details on calculating an appropriate number of
-placement groups for your pool.
-
-.. _Placement Groups: ../placement-groups
-
-
-Delete a Pool
-=============
-
-To delete a pool, execute::
-
- ceph osd pool delete {pool-name}
-
-
-If you created your own rulesets and rules for a pool you created, you should
-consider removing them when you no longer need your pool. If you created users
-with permissions strictly for a pool that no longer exists, you should consider
-deleting those users too.
-
-
-Rename a Pool
-=============
-
-To rename a pool, execute::
-
- ceph osd pool rename {current-pool-name} {new-pool-name}
-
-If you rename a pool and you have per-pool capabilities for an authenticated
-user, you must update the user's capabilities (i.e., caps) with the new pool
-name.
-
-.. note: Version ``0.48`` Argonaut and above.
-
-Show Pool Statistics
-====================
-
-To show a pool's utilization statistics, execute::
-
- rados df
-
-
-Make a Snapshot of a Pool
-=========================
-
-To make a snapshot of a pool, execute::
-
- ceph osd pool mksnap {pool-name} {snap-name}
-
-.. note: Version ``0.48`` Argonaut and above.
-
-
-Remove a Snapshot of a Pool
-===========================
-
-To remove a snapshot of a pool, execute::
-
- ceph osd pool rmsnap {pool-name} {snap-name}
-
-.. note: Version ``0.48`` Argonaut and above.
-
-.. _setpoolvalues:
-
-Set Pool Values
-===============
-
-To set a value to a pool, execute the following::
-
- ceph osd pool set {pool-name} {key} {value}
-
-You may set values for the following keys:
-
-``size``
-
-:Description: Sets the number of replicas for objects in the pool. See `Set the Number of Object Replicas`_ for further details.
-:Type: Integer
-
-``min_size``
-
-:Description: Sets the minimum number of replicas required for io. See `Set the Number of Object Replicas`_ for further details
-:Type: Integer
-
-.. note: Version ``0.54`` and above
-
-``crash_replay_interval``
-
-:Description: The number of seconds to allow clients to replay acknowledged, but uncommitted requests.
-:Type: Integer
-
-
-``pgp_num``
-
-:Description: The effective number of placement groups to use when calculating data placement.
-:Type: Integer
-:Valid Range: Equal to or less than ``pg_num``.
-
-
-``crush_ruleset``
-
-:Description: The ruleset to use for mapping object placement in the cluster.
-:Type: Integer
-
-
-.. note: Version ``0.48`` Argonaut and above.
-
-
-Get Pool Values
-===============
-
-To get a value from a pool, execute the following::
-
- ceph osd pool get {pool-name} {key}
-
-
-``pg_num``
-
-:Description: The number of placement groups for the pool.
-:Type: Integer
-
-
-``pgp_num``
-
-:Description: The effective number of placement groups to use when calculating data placement.
-:Type: Integer
-:Valid Range: Equal to or less than ``pg_num``.
-
-
-Set the Number of Object Replicas
-=================================
-
-To set the number of object replicas, execute the following::
-
- ceph osd pool set {poolname} size {num-replicas}
-
-.. important: The ``{num-replicas}`` includes the object itself.
- If you want the object and two copies of the object for a total of
- three instances of the object, specify ``3``.
-
-For example::
-
- ceph osd pool set data size 3
-
-You may execute this command for each pool.
-
-Note, however, that pool size is more of a best-effort setting: an object
-might accept ios in degraded mode with fewer than size replicas. To
-set a minimum number of required replicas for io, you should use the
-min_size setting.
-
-For example::
-
- ceph osd pool set data min_size 2
-
-This ensures that no object in the data pool will receive io with fewer than
-min_size replicas.
-
-
-Get the Number of Object Replicas
-=================================
-
-To get the number of object replicas, execute the following::
-
- ceph osd dump | grep 'rep size'
-
-Ceph will list the pools, with the ``rep size`` attribute highlighted.
-By default, Ceph creates two replicas of an object (two copies).
+++ /dev/null
-==================================
- Recovering from Monitor Failures
-==================================
-
-In production clusters, we recommend running the cluster with a minimum
-of three monitors. The failure of a single monitor should not take down
-the entire monitor cluster, provided a majority of the monitors remain
-available. If the majority of nodes are available, the remaining nodes
-will be able to form a quorum.
-
-When you check your cluster's health, you may notice that a monitor
-has failed. For example::
-
- ceph health
- HEALTH_WARN 1 mons down, quorum 0,2
-
-For additional detail, you may check the cluster status::
-
- ceph status
- HEALTH_WARN 1 mons down, quorum 0,2
- mon.b (rank 1) addr 192.168.106.220:6790/0 is down (out of quorum)
-
-In most cases, you can simply restart the affected node.
-For example::
-
- service ceph -a restart {failed-mon}
-
-If there are not enough monitors to form a quorum, the ``ceph``
-command will block trying to reach the cluster. In this situation,
-you need to get enough ``ceph-mon`` daemons running to form a quorum
-before doing anything else with the cluster.
\ No newline at end of file
+++ /dev/null
-==============================
- Recovering from OSD Failures
-==============================
-
-Single OSD Failure
-==================
-
-When a ``ceph-osd`` process dies, the monitor will learn about the failure
-from surviving ``ceph-osd`` daemons and report it via the ``ceph health``
-command::
-
- ceph health
- HEALTH_WARN 1/3 in osds are down
-
-Specifically, you will get a warning whenever there are ``ceph-osd``
-processes that are marked ``in`` and ``down``. You can identify which
-``ceph-osds`` are ``down`` with::
-
- ceph health detail
- HEALTH_WARN 1/3 in osds are down
- osd.0 is down since epoch 23, last address 192.168.106.220:6800/11080
-
-Under normal circumstances, simply restarting the ``ceph-osd`` daemon will
-allow it to rejoin the cluster and recover. If there is a disk
-failure or other fault preventing ``ceph-osd`` from functioning or
-restarting, an error message should be present in its log file in
-``/var/log/ceph``.
-
-If the daemon stopped because of a heartbeat failure, the underlying
-kernel file system may be unresponsive. Check ``dmesg`` output for disk
-or other kernel errors.
-
-If the problem is a software error (failed assertion or other
-unexpected error), it should be reported to the :ref:`mailing list
-<mailing-list>`.
-
-
-The Cluster Has No Free Disk Space
-==================================
-
-If the cluster fills up, the monitor will prevent new data from being
-written. The system puts ``ceph-osds`` in two categories: ``nearfull``
-and ``full``, with configurable thresholds for each (80% and 90% by
-default). In both cases, full ``ceph-osds`` will be reported by ``ceph health``::
-
- ceph health
- HEALTH_WARN 1 nearfull osds
- osd.2 is near full at 85%
-
-Or::
-
- ceph health
- HEALTH_ERR 1 nearfull osds, 1 full osds
- osd.2 is near full at 85%
- osd.3 is full at 97%
-
-The best way to deal with a full cluster is to add new ``ceph-osds``,
-allowing the cluster to redistribute data to the newly available
-storage.
-
-
-Homeless Placement Groups
-=========================
-
-It is possible for all OSDs that had copies of a given placement group to fail.
-If that's the case, that subset of the object store is unavailable, and the
-monitor will receive no status updates for those placement groups. To detect
-this situation, the monitor marks any placement group whose primary OSD has
-failed as ``stale``. For example::
-
- ceph health
- HEALTH_WARN 24 pgs stale; 3/300 in osds are down
-
-You can identify which placement groups are ``stale``, and what the last OSDs to
-store them were, with::
-
- ceph health detail
- HEALTH_WARN 24 pgs stale; 3/300 in osds are down
- ...
- pg 2.5 is stuck stale+active+remapped, last acting [2,0]
- ...
- osd.10 is down since epoch 23, last address 192.168.106.220:6800/11080
- osd.11 is down since epoch 13, last address 192.168.106.220:6803/11539
- osd.12 is down since epoch 24, last address 192.168.106.220:6806/11861
-
-If we want to get placement group 2.5 back online, for example, this tells us that
-it was last managed by ``osd.0`` and ``osd.2``. Restarting those ``ceph-osd``
-daemons will allow the cluster to recover that placement group (and, presumably,
-many others).
-
-
-Stuck Placement Groups
-======================
-
-It is normal for placement groups to enter states like "degraded" or "peering"
-following a failure. Normally these states indicate the normal progression
-through the failure recovery process. However, if a placement group stays in one
-of these states for a long time this may be an indication of a larger problem.
-For this reason, the monitor will warn when placement groups get "stuck" in a
-non-optimal state. Specifically, we check for:
-
-* ``inactive`` - The placement group has not been ``active`` for too long
- (i.e., it hasn't been able to service read/write requests).
-
-* ``unclean`` - The placement group has not been ``clean`` for too long
- (i.e., it hasn't been able to completely recover from a previous failure).
-
-* ``stale`` - The placement group status has not been updated by a ``ceph-osd``,
- indicating that all nodes storing this placement group may be ``down``.
-
-You can explicitly list stuck placement groups with one of::
-
- ceph pg dump_stuck stale
- ceph pg dump_stuck inactive
- ceph pg dump_stuck unclean
-
-For stuck ``stale`` placement groups, it is normally a matter of getting the
-right ``ceph-osd`` daemons running again. For stuck ``inactive`` placement
-groups, it is usually a peering problem (see :ref:`failures-osd-peering`). For
-stuck ``unclean`` placement groups, there is usually something preventing
-recovery from completing, like unfound objects (see
-:ref:`failures-osd-unfound`);
-
-
-.. _failures-osd-peering:
-
-Placement Group Down - Peering Failure
-======================================
-
-In certain cases, the ``ceph-osd`` `Peering` process can run into
-problems, preventing a PG from becoming active and usable. For
-example, ``ceph health`` might report::
-
- ceph health detail
- HEALTH_ERR 7 pgs degraded; 12 pgs down; 12 pgs peering; 1 pgs recovering; 6 pgs stuck unclean; 114/3300 degraded (3.455%); 1/3 in osds are down
- ...
- pg 0.5 is down+peering
- pg 1.4 is down+peering
- ...
- osd.1 is down since epoch 69, last address 192.168.106.220:6801/8651
-
-We can query the cluster to determine exactly why the PG is marked ``down`` with::
-
- ceph pg 0.5 query
-
-.. code-block:: javascript
-
- { "state": "down+peering",
- ...
- "recovery_state": [
- { "name": "Started\/Primary\/Peering\/GetInfo",
- "enter_time": "2012-03-06 14:40:16.169679",
- "requested_info_from": []},
- { "name": "Started\/Primary\/Peering",
- "enter_time": "2012-03-06 14:40:16.169659",
- "probing_osds": [
- 0,
- 1],
- "blocked": "peering is blocked due to down osds",
- "down_osds_we_would_probe": [
- 1],
- "peering_blocked_by": [
- { "osd": 1,
- "current_lost_at": 0,
- "comment": "starting or marking this osd lost may let us proceed"}]},
- { "name": "Started",
- "enter_time": "2012-03-06 14:40:16.169513"}
- ]
- }
-
-The ``recovery_state`` section tells us that peering is blocked due to
-down ``ceph-osd`` daemons, specifically ``osd.1``. In this case, we can start that ``ceph-osd``
-and things will recover.
-
-Alternatively, if there is a catastrophic failure of ``osd.1`` (e.g., disk
-failure), we can tell the cluster that it is ``lost`` and to cope as
-best it can.
-
-.. important:: This is dangerous in that the cluster cannot
- guarantee that the other copies of the data are consistent
- and up to date.
-
-To instruct Ceph to continue anyway::
-
- ceph osd lost 1
-
-Recovery will proceed.
-
-
-.. _failures-osd-unfound:
-
-Unfound Objects
-===============
-
-Under certain combinations of failures Ceph may complain about
-``unfound`` objects::
-
- ceph health detail
- HEALTH_WARN 1 pgs degraded; 78/3778 unfound (2.065%)
- pg 2.4 is active+degraded, 78 unfound
-
-This means that the storage cluster knows that some objects (or newer
-copies of existing objects) exist, but it hasn't found copies of them.
-One example of how this might come about for a PG whose data is on ceph-osds
-1 and 2:
-
-* 1 goes down
-* 2 handles some writes, alone
-* 1 comes up
-* 1 and 2 repeer, and the objects missing on 1 are queued for recovery.
-* Before the new objects are copied, 2 goes down.
-
-Now 1 knows that these objects exist, but there is no live ``ceph-osd`` who
-has a copy. In this case, IO to those objects will block, and the
-cluster will hope that the failed node comes back soon; this is
-assumed to be preferable to returning an IO error to the user.
-
-First, you can identify which objects are unfound with::
-
- ceph pg 2.4 list_missing [starting offset, in json]
-
-.. code-block:: javascript
-
- { "offset": { "oid": "",
- "key": "",
- "snapid": 0,
- "hash": 0,
- "max": 0},
- "num_missing": 0,
- "num_unfound": 0,
- "objects": [
- { "oid": "object 1",
- "key": "",
- "hash": 0,
- "max": 0 },
- ...
- ],
- "more": 0}
-
-If there are too many objects to list in a single result, the ``more``
-field will be true and you can query for more. (Eventually the
-command line tool will hide this from you, but not yet.)
-
-Second, you can identify which OSDs have been probed or might contain
-data::
-
- ceph pg 2.4 query
-
-.. code-block:: javascript
-
- "recovery_state": [
- { "name": "Started\/Primary\/Active",
- "enter_time": "2012-03-06 15:15:46.713212",
- "might_have_unfound": [
- { "osd": 1,
- "status": "osd is down"}]},
-
-In this case, for example, the cluster knows that ``osd.1`` might have
-data, but it is ``down``. The full range of possible states include::
-
- * already probed
- * querying
- * osd is down
- * not queried (yet)
-
-Sometimes it simply takes some time for the cluster to query possible
-locations.
-
-It is possible that there are other locations where the object can
-exist that are not listed. For example, if a ceph-osd is stopped and
-taken out of the cluster, the cluster fully recovers, and due to some
-future set of failures ends up with an unfound object, it won't
-consider the long-departed ceph-osd as a potential location to
-consider. (This scenario, however, is unlikely.)
-
-If all possible locations have been queried and objects are still
-lost, you may have to give up on the lost objects. This, again, is
-possible given unusual combinations of failures that allow the cluster
-to learn about writes that were performed before the writes themselves
-are recovered. To mark the "unfound" objects as "lost"::
-
- ceph pg 2.5 mark_unfound_lost revert
-
-The final argument specifies how the cluster should deal with
-lost objects. Currently the only supported option is "revert", which
-will either roll back to a previous version of the object or (if it
-was a new object) forget about it entirely. Use this with caution, as
-it may confuse applications that expected the object to exist.
-
-
-
-Slow or Unresponsive OSD
-========================
-
-If, for some reason, a ``ceph-osd`` is slow to respond to a request, it will
-generate log messages complaining about requests that are taking too
-long. The warning threshold defaults to 30 seconds, and is configurable
-via the ``osd op complaint time`` option. When this happens, the cluster
-log will receive messages like::
-
- osd.0 192.168.106.220:6800/18813 312 : [WRN] old request osd_op(client.5099.0:790 fatty_26485_object789 [write 0~4096] 2.5e54f643) v4 received at 2012-03-06 15:42:56.054801 currently waiting for sub ops
-
-Possible causes include:
-
- * bad disk (check ``dmesg`` output)
- * kernel file system bug (check ``dmesg`` output)
- * overloaded cluster (check system load, iostat, etc.)
- * ceph-osd bug
-
-
-Flapping OSDs
-=============
-
-If something is causing OSDs to "flap" (repeatedly getting marked ``down`` and then
-``up`` again), you can force the monitors to stop with::
-
- ceph osd set noup # prevent osds from getting marked up
- ceph osd set nodown # prevent osds from getting marked down
-
-These flags are recorded in the osdmap structure::
-
- ceph osd dump | grep flags
- flags no-up,no-down
-
-You can clear the flags with::
-
- ceph osd unset noup
- ceph osd unset nodown
-
-Two other flags are supported, ``noin`` and ``noout``, which prevent
-booting OSDs from being marked ``in`` (allocated data) or down
-ceph-osds from eventually being marked ``out`` (regardless of what the
-current value for ``mon osd down out interval`` is).
-
-Note that ``noup``, ``noout``, and ``noout`` are temporary in the
-sense that once the flags are cleared, the action they were blocking
-should occur shortly after. The ``noin`` flag, on the other hand,
-prevents ceph-osds from being marked in on boot, and any daemons that
-started while the flag was set will remain that way.
+++ /dev/null
-=================
- Troubleshooting
-=================
-
-When monitoring your cluster, you may receive health warnings and you may also
-notice that not all of your daemons are running properly. The following
-sections will help you identify and resolve daemon operations issues.
-
-.. toctree::
-
- OSD Failures <troubleshooting-osd>
- MON Failures <troubleshooting-mon>
\ No newline at end of file
--- /dev/null
+==============
+ Librados (C)
+==============
+
+.. highlight:: c
+
+`Librados` provides low-level access to the RADOS service. For an
+overview of RADOS, see :doc:`/architecture`.
+
+
+Example: connecting and writing an object
+=========================================
+
+To use `Librados`, you instantiate a :c:type:`rados_t` variable (a cluster handle) and
+call :c:func:`rados_create()` with a pointer to it::
+
+ int err;
+ rados_t cluster;
+
+ err = rados_create(&cluster, NULL);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot create a cluster handle: %s\n", argv[0], strerror(-err));
+ exit(1);
+ }
+
+Then you configure your :c:type:`rados_t` to connect to your cluster,
+either by setting individual values (:c:func:`rados_conf_set()`),
+using a configuration file (:c:func:`rados_conf_read_file()`), using
+command line options (:c:func:`rados_conf_parse_argv`), or an
+environment variable (:c:func:`rados_conf_parse_env()`)::
+
+ err = rados_conf_read_file(cluster, "/path/to/myceph.conf");
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot read config file: %s\n", argv[0], strerror(-err));
+ exit(1);
+ }
+
+Once the cluster handle is configured, you can connect to the cluster with :c:func:`rados_connect()`::
+
+ err = rados_connect(cluster);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot connect to cluster: %s\n", argv[0], strerror(-err));
+ exit(1);
+ }
+
+Then you open an "IO context", a :c:type:`rados_ioctx_t`, with :c:func:`rados_ioctx_create()`::
+
+ rados_ioctx_t io;
+ char *poolname = "mypool";
+
+ err = rados_ioctx_create(cluster, poolname, &io);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot open rados pool %s: %s\n", argv[0], poolname, strerror(-err));
+ rados_shutdown(cluster);
+ exit(1);
+ }
+
+Note that the pool you try to access must exist.
+
+Then you can use the RADOS data manipulation functions, for example
+write into an object called ``greeting`` with
+:c:func:`rados_write_full()`::
+
+ err = rados_write_full(io, "greeting", "hello", 5);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot write pool %s: %s\n", argv[0], poolname, strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ }
+
+In the end, you'll want to close your IO context and connection to RADOS with :c:func:`rados_ioctx_destroy()` and :c:func:`rados_shutdown()`::
+
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+
+
+Asynchronous IO
+===============
+
+When doing lots of IO, you often don't need to wait for one operation
+to complete before starting the next one. `Librados` provides
+asynchronous versions of several operations:
+
+* :c:func:`rados_aio_write`
+* :c:func:`rados_aio_append`
+* :c:func:`rados_aio_write_full`
+* :c:func:`rados_aio_read`
+
+For each operation, you must first create a
+:c:type:`rados_completion_t` that represents what to do when the
+operation is safe or complete by calling
+:c:func:`rados_aio_create_completion`. If you don't need anything
+special to happen, you can pass NULL::
+
+ rados_completion_t comp;
+ err = rados_aio_create_completion(NULL, NULL, NULL, &comp);
+ if (err < 0) {
+ fprintf(stderr, "%s: could not create aio completion: %s\n", argv[0], strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ }
+
+Now you can call any of the aio operations, and wait for it to
+be in memory or on disk on all replicas::
+
+ err = rados_aio_write(io, "foo", comp, "bar", 3, 0);
+ if (err < 0) {
+ fprintf(stderr, "%s: could not schedule aio write: %s\n", argv[0], strerror(-err));
+ rados_aio_release(comp);
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ }
+ rados_wait_for_complete(comp); // in memory
+ rados_wait_for_safe(comp); // on disk
+
+Finally, we need to free the memory used by the completion with :c:func:`rados_aio_release`::
+
+ rados_aio_release(comp);
+
+You can use the callbacks to tell your application when writes are
+durable, or when read buffers are full. For example, if you wanted to
+measure the latency of each operation when appending to several
+objects, you could schedule several writes and store the ack and
+commit time in the corresponding callback, then wait for all of them
+to complete using :c:func:`rados_aio_flush` before analyzing the
+latencies::
+
+ typedef struct {
+ struct timeval start;
+ struct timeval ack_end;
+ struct timeval commit_end;
+ } req_duration;
+
+ void ack_callback(rados_completion_t comp, void *arg) {
+ req_duration *dur = (req_duration *) arg;
+ gettimeofday(&dur->ack_end, NULL);
+ }
+
+ void commit_callback(rados_completion_t comp, void *arg) {
+ req_duration *dur = (req_duration *) arg;
+ gettimeofday(&dur->commit_end, NULL);
+ }
+
+ int output_append_latency(rados_ioctx_t io, const char *data, size_t len, size_t num_writes) {
+ req_duration times[num_writes];
+ rados_completion_t comps[num_writes];
+ for (size_t i = 0; i < num_writes; ++i) {
+ gettimeofday(&times[i].start, NULL);
+ int err = rados_aio_create_completion((void*) &times[i], ack_callback, commit_callback, &comps[i]);
+ if (err < 0) {
+ fprintf(stderr, "Error creating rados completion: %s\n", strerror(-err));
+ return err;
+ }
+ char obj_name[100];
+ snprintf(obj_name, sizeof(obj_name), "foo%lu", (unsigned long)i);
+ err = rados_aio_append(io, obj_name, comps[i], data, len);
+ if (err < 0) {
+ fprintf(stderr, "Error from rados_aio_append: %s\n", strerror(-err));
+ return err;
+ }
+ }
+ // wait until all requests finish *and* the callbacks complete
+ rados_aio_flush(io);
+ // the latencies can now be analyzed
+ printf("Request # | Ack latency (s) | Commit latency (s)\n");
+ for (size_t i = 0; i < num_writes; ++i) {
+ // don't forget to free the completions
+ rados_aio_release(comps[i]);
+ struct timeval ack_lat, commit_lat;
+ timersub(&times[i].ack_end, &times[i].start, &ack_lat);
+ timersub(&times[i].commit_end, &times[i].start, &commit_lat);
+ printf("%9lu | %8ld.%06ld | %10ld.%06ld\n", (unsigned long) i, ack_lat.tv_sec, ack_lat.tv_usec, commit_lat.tv_sec, commit_lat.tv_usec);
+ }
+ return 0;
+ }
+
+Note that all the :c:type:`rados_completion_t` must be freed with :c:func:`rados_aio_release` to avoid leaking memory.
+
+
+API calls
+=========
+
+ .. doxygenfile:: librados.h
--- /dev/null
+==================
+ LibradosPP (C++)
+==================
+
+.. todo:: write me!
--- /dev/null
+==========================
+ Adding/Removing Monitors
+==========================
+
+When you have a cluster up and running, you may add or remove monitors
+from the cluster at runtime.
+
+Adding Monitors
+===============
+
+Ceph monitors are light-weight processes that maintain a master copy of the
+cluster map. You can run a cluster with 1 monitor. We recommend at least 3
+monitors for a production cluster. Ceph monitors use PAXOS to establish
+consensus about the master cluster map, which requires a majority of
+monitors running to establish a quorum for consensus about the cluster map
+(e.g., 1; 3 out of 5; 4 out of 6; etc.).
+
+Since monitors are light-weight, it is possible to run them on the same
+host as an OSD; however, we recommend running them on separate hosts.
+
+.. important:: A *majority* of monitors in your cluster must be able to
+ reach each other in order to establish a quorum.
+
+Deploy your Hardware
+--------------------
+
+If you are adding a new host when adding a new monitor, see `Hardware
+Recommendations`_ for details on minimum recommendations for monitor hardware.
+To add a monitor host to your cluster, first make sure you have an up-to-date
+version of Linux installed (typically Ubuntu 12.04 precise).
+
+Add your monitor host to a rack in your cluster, connect it to the network
+and ensure that it has network connectivity.
+
+.. _Hardware Recommendations: ../../install/hardware-recommendations
+
+Install the Required Software
+-----------------------------
+
+For manually deployed clusters, you must install Ceph packages
+manually. See `Installing Debian/Ubuntu Packages`_ for details.
+You should configure SSH to a user with password-less authentication
+and root permissions.
+
+.. _Installing Debian/Ubuntu Packages: ../../install/debian
+
+For clusters deployed with Chef, create a `chef user`_, `configure
+SSH keys`_, `install Ruby`_ and `install the Chef client`_ on your host. See
+`Installing Chef`_ for details.
+
+.. _chef user: ../../install/chef#createuser
+.. _configure SSH keys: ../../install/chef#genkeys
+.. _install the Chef client: ../../install/chef#installchef
+.. _Installing Chef: ../../install/chef
+.. _install Ruby: ../../install/chef#installruby
+
+.. _adding-mon:
+
+Adding a Monitor (Manual)
+-------------------------
+
+This procedure creates a ``ceph-mon`` data directory, retrieves the monitor map
+and monitor keyring, and adds a ``ceph-mon`` daemon to your cluster. If
+this results in only two monitor daemons, you may add more monitors by
+repeating this procedure until you have a sufficient number of ``ceph-mon``
+daemons to achieve a quorum.
+
+#. Create the default directory on your new monitor. ::
+
+ ssh {new-mon-host}
+ sudo mkdir /var/lib/ceph/mon/ceph-{mon-letter}
+
+#. Create a temporary directory ``{tmp}`` to keep the files needed during
+ this process. This directory should be different from monitor's default
+ directory created in the previous step, and can be removed after all the
+ steps are taken. ::
+
+ mkdir {tmp}
+
+#. Retrieve the keyring for your monitors, where ``{tmp}`` is the path to
+ the retrieved keyring, and ``{key-filename}`` is the name of the file containing
+ the retrieved monitor key. ::
+
+ ceph auth get mon. -o {tmp}/{key-filename}
+
+#. Retrieve the monitor map, where ``{tmp}`` is the path to
+ the retrieved monitor map, and ``{map-filename}`` is the name of the file
+ containing the retrieved monitor map. ::
+
+ ceph mon getmap -o {tmp}/{map-filename}
+
+#. Prepare the monitor's data directory created in the first step. You must
+ specify the path to the monitor map so that you can retrieve the
+ information about a quorum of monitors and their ``fsid``. You must also
+ specify a path to the monitor keyring::
+
+ sudo ceph-mon -i {mon-letter} --mkfs --monmap {tmp}/{map-filename} --keyring {tmp}/{key-filename}
+
+
+#. Add a ``[mon.{letter}]`` entry for your new monitor in your ``ceph.conf`` file. ::
+
+ [mon.c]
+ host = new-mon-host
+ addr = ip-addr:6789
+
+#. Add the new monitor to the list of monitors for your cluster (runtime). This enables
+ other nodes to use this monitor during their initial startup. ::
+
+ ceph mon add <name> <ip>[:<port>]
+
+#. Start the new monitor and it will automatically join the cluster.
+ The daemon needs to know which address to bind to, either via
+ ``--public-addr {ip:port}`` or by setting ``mon addr`` in the
+ appropriate section of ``ceph.conf``. For example::
+
+ ceph-mon -i newname --public-addr {ip:port}
+
+
+Removing Monitors
+=================
+
+When you remove monitors from a cluster, consider that Ceph monitors use
+PAXOS to establish consensus about the master cluster map. You must have
+a sufficient number of monitors to establish a quorum for consensus about
+the cluster map.
+
+Removing a Monitor (Manual)
+---------------------------
+
+This procedure removes a ``ceph-mon`` daemon from your cluster. If this
+procedure results in only two monitor daemons, you may add or remove another
+monitor until you have a number of ``ceph-mon`` daemons that can achieve a
+quorum.
+
+#. Stop the monitor. ::
+
+ service ceph -a stop mon.{mon-letter}
+
+#. Remove the monitor from the cluster. ::
+
+ ceph mon remove {mon-letter}
+
+#. Remove the monitor entry from ``ceph.conf``.
+
+
+Removing Monitors from an Unhealthy Cluster
+-------------------------------------------
+
+This procedure removes a ``ceph-mon`` daemon from an unhealthy cluster--i.e.,
+a cluster that has placement groups that are persistently not ``active + clean``.
+
+
+#. Identify a surviving monitor. ::
+
+ ceph mon dump
+
+#. Navigate to a surviving monitor's ``monmap`` directory. ::
+
+ ssh {mon-host}
+ cd /var/lib/ceph/mon/ceph-{mon-letter}/monmap
+
+#. List the directory contents and identify the last committed map.
+ Directory contents will show a numeric list of maps. ::
+
+ ls
+ 1 2 3 4 5 first_committed last_committed last_pn latest
+
+
+#. Identify the most recently committed map. ::
+
+ sudo cat last_committed
+
+#. Copy the most recently committed file to a temporary directory. ::
+
+ cp /var/lib/ceph/mon/ceph-{mon-letter}/monmap/{last_committed} /tmp/surviving_map
+
+#. Remove the non-surviving monitors. For example, if you have three monitors,
+ ``mon.a``, ``mon.b``, and ``mon.c``, where only ``mon.a`` will survive, follow
+ the example below::
+
+ monmaptool /tmp/surviving_map --rm {mon-letter}
+ #for example
+ monmaptool /tmp/surviving_map --rm b
+ monmaptool /tmp/surviving_map --rm c
+
+#. Stop all monitors. ::
+
+ service ceph -a stop mon
+
+#. Inject the surviving map with the removed monitors into the surviving monitors.
+ For example, to inject a map into monitor ``mon.a``, follow the example below::
+
+ ceph-mon -i {mon-letter} --inject-monmap {map-path}
+ #for example
+ ceph-mon -i a --inject-monmap /etc/surviving_map
--- /dev/null
+======================
+ Adding/Removing OSDs
+======================
+
+When you have a cluster up and running, you may add OSDs or remove OSDs
+from the cluster at runtime.
+
+Adding OSDs
+===========
+
+When you want to expand a cluster, you may add an OSD at runtime. With Ceph, an
+OSD is generally one Ceph ``ceph-osd`` daemon for one storage disk within a host
+machine. If your host has multiple storage disks, you may map one ``ceph-osd``
+daemon for each disk.
+
+Generally, it's a good idea to check the capacity of your cluster to see if you
+are reaching the upper end of its capacity. As your cluster reaches its ``near
+full`` ratio, you should add one or more OSDs to expand your cluster's capacity.
+
+.. warning:: Do not let your cluster reach its ``full ratio`` before
+ adding an OSD. OSD failures that occur after the cluster reaches
+ its ``near full`` ratio may cause the cluster to exceed its
+ ``full ratio``.
+
+Deploy your Hardware
+--------------------
+
+If you are adding a new host when adding a new OSD,
+see `Hardware Recommendations`_ for details on minimum recommendations
+for OSD hardware. To add an OSD host to your cluster, first make sure you have
+an up-to-date version of Linux installed (typically Ubuntu 12.04 precise),
+and you have made some initial preparations for your storage disks.
+See `Filesystem Recommendations`_ for details.
+
+Add your OSD host to a rack in your cluster, connect it to the network
+and ensure that it has network connectivity.
+
+.. _Hardware Recommendations: ../../install/hardware-recommendations
+.. _Filesystem Recommendations: ../../config-cluster/file-system-recommendations
+
+Install the Required Software
+-----------------------------
+
+For manually deployed clusters, you must install Ceph packages
+manually. See `Installing Debian/Ubuntu Packages`_ for details.
+You should configure SSH to a user with password-less authentication
+and root permissions.
+
+.. _Installing Debian/Ubuntu Packages: ../../install/debian
+
+For clusters deployed with Chef, create a `chef user`_, `configure
+SSH keys`_, `install Ruby`_ and `install the Chef client`_ on your host. See
+`Installing Chef`_ for details.
+
+.. _chef user: ../../install/chef#createuser
+.. _configure SSH keys: ../../install/chef#genkeys
+.. _install the Chef client: ../../install/chef#installchef
+.. _Installing Chef: ../../install/chef
+.. _Install Ruby: ../../install/chef#installruby
+
+Adding an OSD (Manual)
+----------------------
+
+This procedure sets up a ``ceph-osd`` daemon, configures it to use one disk,
+and configures the cluster to distribute data to the OSD. If your host has
+multiple disks, you may add an OSD for each disk by repeating this procedure.
+
+To add an OSD, create a data directory for it, mount a disk to that directory,
+add the OSD to your configuration file, add the OSD to the cluster, and then
+add it to the CRUSH map.
+
+When you add the OSD to the CRUSH map, consider the weight you give to the new
+OSD. Hard disk capacity grows 40% per year, so newer OSD hosts may have larger
+hard disks than older hosts in the cluster (i.e., they may have greater weight).
+
+#. Create the default directory on your new OSD. ::
+
+ ssh {new-osd-host}
+ sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
+
+
+#. If the OSD is for a disk other than the OS disk, prepare it
+ for use with Ceph, and mount it to the directory you just created::
+
+ ssh {new-osd-host}
+ sudo mkfs -t {fstype} /dev/{disk}
+ sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number}
+
+
+#. Navigate to the host where you keep the master copy of the cluster's
+ ``ceph.conf`` file. ::
+
+ ssh {admin-host}
+ cd /etc/ceph
+ vim ceph.conf
+
+#. Add the new OSD to your ``ceph.conf`` file.
+
+ .. code-block:: ini
+
+ [osd.123]
+ host = {hostname}
+
+#. From the host where you keep the master copy of the cluster's
+ ``ceph.conf`` file, copy the updated ``ceph.conf`` file to your
+ new OSD's ``/etc/ceph`` directory and to other hosts in your cluster. ::
+
+ ssh {new-osd} sudo tee /etc/ceph/ceph.conf < /etc/ceph/ceph.conf
+
+#. Create the OSD. ::
+
+ ceph osd create {osd-num}
+ ceph osd create 123 #for example
+
+#. Initialize the OSD data directory. ::
+
+ ssh {new-osd-host}
+ ceph-osd -i {osd-num} --mkfs --mkkey
+
+ The directory must be empty before you can run ``ceph-osd``.
+
+#. Register the OSD authentication key. The value of ``ceph`` for
+ ``ceph-{osd-num}`` in the path is the ``$cluster-$id``. If your
+ cluster name differs from ``ceph``, use your cluster name instead.::
+
+ ceph auth add osd.{osd-num} osd 'allow *' mon 'allow rwx' -i /var/lib/ceph/osd/ceph-{osd-num}/keyring
+
+#. Add the OSD to the CRUSH map so that it can begin receiving data. You may
+ also decompile the CRUSH map, add the OSD to the device list, add the host as a
+ bucket (if it's not already in the CRUSH map), add the device as an item in the
+ host, assign it a weight, recompile it and set it. See `Add/Move an OSD`_ for
+ details. ::
+
+ ceph osd crush set {id} {name} {weight} [{bucket-type}={bucket-name}, ...]
+
+
+.. topic:: Argonaut (v0.48) Best Practices
+
+ To limit impact on user I/O performance, add an OSD to the CRUSH map
+ with an initial weight of ``0``. Then, ramp up the CRUSH weight a
+ little bit at a time. For example, to ramp by increments of ``0.2``,
+ start with::
+
+ ceph osd crush reweight {osd-id} .2
+
+ and allow migration to complete before reweighting to ``0.4``,
+ ``0.6``, and so on until the desired CRUSH weight is reached.
+
+ To limit the impact of OSD failures, you can set::
+
+ mon osd down out interval = 0
+
+ which prevents down OSDs from automatically being marked out, and then
+ ramp them down manually with::
+
+ ceph osd reweight {osd-num} .8
+
+ Again, wait for the cluster to finish migrating data, and then adjust
+ the weight further until you reach a weight of 0. Note that this
+ problem prevents the cluster to automatically re-replicate data after
+ a failure, so please ensure that sufficient monitoring is in place for
+ an administrator to intervene promptly.
+
+ Note that this practice will no longer be necessary in Bobtail and
+ subsequent releases.
+
+
+Adding an OSD (Chef)
+--------------------
+
+This procedure configures your OSD using ``chef-client``. If your host has
+multiple disks, you may need to execute the procedure for preparing an OSD disk
+for each data disk on your host.
+
+When you add the OSD to the CRUSH map, consider the weight you give to the new
+OSD. Hard disk capacity grows 40% per year, so newer OSD hosts may have larger
+hard disks than older hosts in the cluster.
+
+#. Execute ``chef-client`` to register it with Chef as a Chef node.
+
+#. Edit the node. See `Configure Nodes`_ for details.
+ Change its environment to your Chef environment.
+ Add ``"role[ceph-osd]"`` to the run list.
+
+#. Execute `Prepare OSD Disks`_ for each disk.
+
+#. Execute ``chef-client`` to invoke the run list.
+
+#. Add the OSD to the CRUSH map so that it can begin receiving data. You may
+ also decompile the CRUSH map edit the file, recompile it and set it. See
+ `Add/Move an OSD`_ for details. ::
+
+ ceph osd crush set {id} {name} {weight} pool={pool-name} [{bucket-type}={bucket-name}, ...]
+
+
+Starting the OSD
+----------------
+
+After you add an OSD to Ceph, the OSD is in your configuration. However,
+it is not yet running. The OSD is ``down`` and ``out``. You must start
+your new OSD before it can begin receiving data. You may use
+``service ceph`` from your admin host or start the OSD from its host
+machine::
+
+ service ceph -a start osd.{osd.num}
+ #or alternatively
+ ssh {new-osd-host}
+ sudo /etc/init.d/ceph start osd.{osd-num}
+
+
+Once you start your OSD, it is ``up``.
+
+Put the OSD ``in`` the Cluster
+------------------------------
+
+After you start your OSD, it is ``up`` and ``out``. You need to put it into
+the cluster so that Ceph can begin writing data to it. ::
+
+ ceph osd in {osd-num}
+
+
+Observe the Data Migration
+--------------------------
+
+Once you have added your new OSD to the CRUSH map, Ceph will begin rebalancing
+the server by migrating placement groups to your new OSD. You can observe this
+process with the `ceph`_ tool. ::
+
+ ceph -w
+
+You should see the placement group states change from ``active+clean`` to
+``active, some degraded objects``, and finally ``active+clean`` when migration
+completes. (Control-c to exit.)
+
+
+.. _Add/Move an OSD: ../crush-map#addosd
+.. _Configure Nodes: ../../config-cluster/chef#confignodes
+.. _Prepare OSD Disks: ../../config-cluster/chef#prepdisks
+.. _ceph: ../monitoring
+
+
+
+Removing OSDs
+=============
+
+When you want to reduce the size of a cluster or replace hardware, you may
+remove an OSD at runtime. With Ceph, an OSD is generally one Ceph ``ceph-osd``
+daemon for one storage disk within a host machine. If your host has multiple
+storage disks, you may need to remove one ``ceph-osd`` daemon for each disk.
+Generally, it's a good idea to check the capacity of your cluster to see if you
+are reaching the upper end of its capacity. Ensure that when you remove an OSD
+that your cluster is not at its ``near full`` ratio.
+
+.. warning:: Do not let your cluster reach its ``full ratio`` when
+ removing an OSD. Removing OSDs could cause the cluster to reach
+ or exceed its ``full ratio``.
+
+
+Take the OSD ``out`` of the Cluster
+-----------------------------------
+
+Before you remove an OSD, it is usually ``up`` and ``in``. You need to take it
+out of the cluster so that Ceph can begin rebalancing and copying its data to
+other OSDs. ::
+
+ ceph osd out {osd-num}
+
+
+Observe the Data Migration
+--------------------------
+
+Once you have taken your OSD ``out`` of the cluster, Ceph will begin
+rebalancing the cluster by migrating placement groups out of the OSD you
+removed. You can observe this process with the `ceph`_ tool. ::
+
+ ceph -w
+
+You should see the placement group states change from ``active+clean`` to
+``active, some degraded objects``, and finally ``active+clean`` when migration
+completes. (Control-c to exit.)
+
+
+Stopping the OSD
+----------------
+
+After you take an OSD out of the cluster, it may still be running.
+That is, the OSD may be ``up`` and ``out``. You must stop
+your OSD before you remove it from the configuration. ::
+
+ ssh {new-osd-host}
+ sudo /etc/init.d/ceph stop osd.{osd-num}
+
+Once you stop your OSD, it is ``down``.
+
+
+Removing an OSD (Manual)
+------------------------
+
+This procedure removes an OSD from a cluster map, removes its authentication
+key, removes the OSD from the OSD map, and removes the OSD from the
+``ceph.conf`` file. If your host has multiple disks, you may need to remove an
+OSD for each disk by repeating this procedure.
+
+
+#. Remove the OSD from the CRUSH map so that it no longer receives data. You may
+ also decompile the CRUSH map, remove the OSD from the device list, remove the
+ device as an item in the host bucket or remove the host bucket (if it's in the
+ CRUSH map and you intend to remove the host), recompile the map and set it.
+ See `Remove an OSD`_ for details. ::
+
+ ceph osd crush remove {name}
+
+#. Remove the OSD authentication key. ::
+
+ ceph auth del osd.{osd-num}
+
+ The value of ``ceph`` for ``ceph-{osd-num}`` in the path is the ``$cluster-$id``.
+ If your cluster name differs from ``ceph``, use your cluster name instead.
+
+#. Remove the OSD. ::
+
+ ceph osd rm {osd-num}
+ #for example
+ ceph osd rm 123
+
+#. Navigate to the host where you keep the master copy of the cluster's
+ ``ceph.conf`` file. ::
+
+ ssh {admin-host}
+ cd /etc/chef
+ vim ceph.conf
+
+#. Remove the OSD entry from your ``ceph.conf`` file. ::
+
+ [osd.123]
+ host = {hostname}
+
+#. From the host where you keep the master copy of the cluster's ``ceph.conf`` file,
+ copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of other
+ hosts in your cluster. ::
+
+ ssh {osd} sudo tee /etc/ceph/ceph.conf < /etc/ceph/ceph.conf
+
+.. _Remove an OSD: ../crush-map#removeosd
--- /dev/null
+=====================================
+ Ceph Authentication & Authorization
+=====================================
+
+Ceph is a distributed storage system where a typical deployment involves a
+relatively small quorum of *monitors*, scores of *metadata servers* (MDSs) and
+many thousands of OSD daemons operating across many hosts/nodes--representing
+the server portion of the Ceph object store. Ceph clients such as CephFS, Ceph
+block device and Ceph Gateway interact with the Ceph object store. All Ceph
+object store clients use the ``librados`` library to interact with the Ceph
+object store. The following diagram illustrates an abstract client/server
+technology stack.
+
+.. ditaa:: +---------------------------------------------------+
+ | client |
+ +---------------------------------------------------+
+ | librados |
+ +---------------------------------------------------+
+ +---------------+ +---------------+ +---------------+
+ | OSDs | | MDSs | | Monitors |
+ +---------------+ +---------------+ +---------------+
+
+Users are either individuals or system actors such as applications, which
+use Ceph clients to interact with Ceph server daemons.
+
+.. ditaa:: +-----+
+ | {o} |
+ | |
+ +--+--+ /---------\ /---------\
+ | | Ceph | | Ceph |
+ ---+---*----->| |<------------->| |
+ | uses | Clients | | Servers |
+ | \---------/ \---------/
+ /--+--\
+ | |
+ | |
+ actor
+
+For additional information, see our `Cephx Guide`_ and `ceph-authtool manpage`_.
+
+.. _Cephx Guide: ../authentication
+.. _ceph-authtool manpage: ../../man/8/ceph-authtool/
+
+Ceph Authentication (cephx)
+===========================
+
+Cryptographic authentication has some computational costs, though they should
+generally be quite low. If the network environment connecting your client and
+server hosts is very safe and you cannot afford authentication, you can use a
+Ceph option to turn it off. **This is not generally recommended**, but should you
+need to do so, details can be found in the `Disable Cephx`_ section.
+
+.. important:: Remember, if you disable authentication, you are at risk of a
+ man-in-the-middle attack altering your client/server messages, which could
+ lead to disastrous security effects.
+
+A key scalability feature of Ceph is to avoid a centralized interface to the
+Ceph object store, which means that Ceph clients must be able to interact with
+OSDs directly. To protect data, Ceph provides its ``cephx`` authentication
+system, which authenticates users operating Ceph clients. The ``cephx`` protocol
+operates in a manner with behavior similar to `Kerberos`_.
+
+.. _Disable Cephx: ../authentication#disable-cephx
+.. _Kerberos: http://en.wikipedia.org/wiki/Kerberos_(protocol)
+
+A user/actor invokes a Ceph client to contact a monitor. Unlike Kerberos, each
+monitor can authenticate users and distribute keys, so there is no single point
+of failure or bottleneck when using ``cephx``. The monitor returns an
+authentication data structure similar to a Kerberos ticket that contains a
+session key for use in obtaining Ceph services. This session key is itself
+encrypted with the user's permanent secret key, so that only the user can
+request services from the Ceph monitor(s). The client then uses the session key
+to request its desired services from the monitor, and the monitor provides the
+client with a ticket that will authenticate the client to the OSDs that actually
+handle data. Ceph monitors and OSDs share a secret, so the client can use the
+ticket provided by the monitor with any OSD or metadata server in the cluster.
+Like Kerberos, ``cephx`` tickets expire, so an attacker cannot use an expired
+ticket or session key obtained surreptitiously. This form of authentication will
+prevent attackers with access to the communications medium from either creating
+bogus messages under another user's identity or altering another user's
+legitimate messages, as long as the user's secret key is not divulged before it
+expires.
+
+To use ``cephx``, an administrator must set up users first. In the following
+diagram, the ``client.admin`` user invokes ``ceph auth get-or-create-key`` from
+the command line to generate a username and secret key. Ceph's ``auth``
+subsystem generates the username and key, stores a copy with the monitor(s) and
+transmits the user's secret back to the ``client.admin`` user. This means that
+the client and the monitor share a secret key.
+
+.. note:: The ``client.admin`` user must provide the user ID and
+ secret key to the user in a secure manner.
+
+.. ditaa:: +---------+ +---------+
+ | Client | | Monitor |
+ +---------+ +---------+
+ | request to |
+ | create a user |
+ |-------------->|----------+ create user
+ | | | and
+ |<--------------|<---------+ store key
+ | transmit key |
+ | |
+
+
+To authenticate with the monitor, the client passes in the user name to the
+monitor, and the monitor generates a session key and encrypts it with the secret
+key associated with the user name. Then, the monitor transmits the encrypted
+ticket back to the client. The client then decrypts the payload with the shared
+secret key to retrieve the session key. The session key identifies the user for
+the current session. The client then requests a ticket on behalf of the user
+signed by the session key. The monitor generates a ticket, encrypts it with the
+user's secret key and transmits it back to the client. The client decrypts the
+ticket and uses it to sign requests to OSDs and metadata servers throughout the
+cluster.
+
+.. ditaa:: +---------+ +---------+
+ | Client | | Monitor |
+ +---------+ +---------+
+ | authenticate |
+ |-------------->|----------+ generate and
+ | | | encrypt
+ |<--------------|<---------+ session key
+ | transmit |
+ | encrypted |
+ | session key |
+ | |
+ |-----+ decrypt |
+ | | session |
+ |<----+ key |
+ | |
+ | req. ticket |
+ |-------------->|----------+ generate and
+ | | | encrypt
+ |<--------------|<---------+ ticket
+ | recv. ticket |
+ | |
+ |-----+ decrypt |
+ | | ticket |
+ |<----+ |
+
+
+The ``cephx`` protocol authenticates ongoing communications between the client
+machine and the Ceph servers. Each message sent between a client and server,
+subsequent to the initial authentication, is signed using a ticket that the
+monitors, OSDs and metadata servers can verify with their shared secret.
+
+.. ditaa:: +---------+ +---------+ +-------+ +-------+
+ | Client | | Monitor | | MDS | | OSD |
+ +---------+ +---------+ +-------+ +-------+
+ | request to | | |
+ | create a user | | |
+ |-------------->| mon and | |
+ |<--------------| client share | |
+ | receive | a secret. | |
+ | shared secret | | |
+ | |<------------>| |
+ | |<-------------+------------>|
+ | | mon, mds, | |
+ | authenticate | and osd | |
+ |-------------->| share | |
+ |<--------------| a secret | |
+ | session key | | |
+ | | | |
+ | req. ticket | | |
+ |-------------->| | |
+ |<--------------| | |
+ | recv. ticket | | |
+ | | | |
+ | make request (CephFS only) | |
+ |----------------------------->| |
+ |<-----------------------------| |
+ | receive response (CephFS only) |
+ | |
+ | make request |
+ |------------------------------------------->|
+ |<-------------------------------------------|
+ receive response
+
+The protection offered by this authentication is between the Ceph client and the
+Ceph server hosts. The authentication is not extended beyond the Ceph client. If
+the user accesses the Ceph client from a remote host, Ceph authentication is not
+applied to the connection between the user's host and the client host.
+
+
+Ceph Authorization (caps)
+=========================
+
+Ceph uses the term "capabilities" (caps) to describe authorizing an
+authenticated user to exercise the functionality of the monitors, OSDs and
+metadata servers. Capabilities can also restrict access to data within one or
+more pools.
+
+.. note:: Ceph uses the capabilities discussed here for setting up and
+ controlling access between various Ceph client and server instances, and
+ are relevant regardless of what type of client accesses the Ceph object
+ store. CephFS uses a different type of capability for files and directories
+ internal to the CephFS filesystem. CephFS filesystem access controls are
+ relevant to CephFS, but not block devices or the RESTful gateway.
+
+A Ceph ``client.admin`` user sets a user's capabilities when creating
+the user.
+
+
+``allow``
+
+:Description: Precedes access settings for a daemon. Implies ``rw`` for MDS only.
+:Example: ``ceph-authtool -n client.foo --cap mds 'allow'``
+
+
+``r``
+
+:Description: Gives the user read access. Required with monitors to retrieve the CRUSH map.
+:Example: ``ceph-authtool -n client.foo --cap mon 'allow r'``
+
+
+``w``
+
+:Description: Gives the user write access to objects.
+:Example: ``ceph-authtool -n client.foo --cap osd 'allow w'``
+
+
+``x``
+
+:Description: Gives the user the capability to call class methods (i.e., both read and write).
+:Example: ``ceph-authtool -n client.foo --cap osd 'allow x'``
+
+
+``class-read``
+
+:Description: Gives the user the capability to call class read methods. Subset of ``x``.
+:Example: ``ceph-authtool -n client.foo --cap osd 'allow class-read'``
+
+
+``class-write``
+
+:Description: Gives the user the capability to call class write methods. Subset of ``x``.
+:Example: ``ceph-authtool -n client.foo --cap osd 'allow class-write'``
+
+
+``*``
+
+:Description: Gives the user read, write and execute permissions for a particular daemon/pool, and the ability to execute admin commands.
+:Example: ``ceph-authtool -n client.foo --cap osd 'allow *'``
+
+
+When setting capabilities for a user, Ceph also supports restricting the
+capabilities to a particular pool. This means you can have full access to some
+pools, and restricted (or no) access to other pools for the same user.
+For example::
+
+ ceph-authtool -n client.foo --cap osd 'allow rwx' pool=customer-pool
+
+
+
+Cephx Limitations
+=================
+
+The ``cephx`` protocol authenticates Ceph clients and servers to each other. It
+is not intended to handle authentication of human users or application programs
+run on their behalf. If that effect is required to handle your access control
+needs, you must have another mechanism, which is likely to be specific to the
+front end used to access the Ceph object store. This other mechanism has the
+role of ensuring that only acceptable users and programs are able to run on the
+machine that Ceph will permit to access its object store.
+
+The keys used to authenticate Ceph clients and servers are typically stored in
+a plain text file with appropriate permissions in a trusted host.
+
+.. important:: Storing keys in plaintext files has security shortcomings, but
+ they are difficult to avoid, given the basic authentication methods Ceph
+ uses in the background. Those setting up Ceph systems should be aware of
+ these shortcomings.
+
+In particular, arbitrary user machines, especially portable machines, should not
+be configured to interact directly with Ceph, since that mode of use would
+require the storage of a plaintext authentication key on an insecure machine.
+Anyone who stole that machine or obtained surreptitious access to it could
+obtain the key that will allow them to authenticate their own machines to Ceph.
+
+Rather than permitting potentially insecure machines to access a Ceph object
+store directly, users should be required to sign in to a trusted machine in
+your environment using a method that provides sufficient security for your
+purposes. That trusted machine will store the plaintext Ceph keys for the
+human users. A future version of Ceph may address these particular
+authentication issues more fully.
+
+At the moment, none of the Ceph authentication protocols provide secrecy for
+messages in transit. Thus, an eavesdropper on the wire can hear and understand
+all data sent between clients and servers in Ceph, even if he cannot create or
+alter them. Further, Ceph does not include options to encrypt user data in the
+object store. Users can hand-encrypt and store their own data in the Ceph
+object store, of course, but Ceph provides no features to perform object
+encryption itself. Those storing sensitive data in Ceph should consider
+encrypting their data before providing it to the Ceph system.
--- /dev/null
+=============
+ Cephx Guide
+=============
+
+Ceph provides two authentication modes:
+
+- **None:** Any user can access data without authentication.
+- **Cephx:** Ceph requires user authentication in a manner similar to Kerberos.
+
+If you disable ``cephx``, you do not need to generate keys using the procedures
+described here. If you re-enable ``cephx`` and have already generated keys, you
+do not need to generate the keys again.
+
+.. important:: The ``cephx`` protocol does not address data encryption in transport
+ (e.g., SSL/TLS) or encryption at rest.
+
+For additional information, see our `Cephx Intro`_ and `ceph-authtool manpage`_.
+
+.. _Cephx Intro: ../auth-intro
+.. _ceph-authtool manpage: ../../man/8/ceph-authtool/
+
+
+Configuring Cephx
+=================
+
+There are several important procedures you must follow to enable the ``cephx``
+protocol for your Ceph cluster and its daemons. First, you must generate a
+secret key for the default ``client.admin`` user so the administrator can
+execute Ceph commands. Second, you must generate a monitor secret key and
+distribute it to all monitors in the cluster. Finally, you can follow the
+remaining steps in `Enabling Cephx`_ to enable authentication.
+
+.. _client-admin-key:
+
+The ``client.admin`` Key
+------------------------
+
+When you first install Ceph, each Ceph command you execute on the command line
+assumes that you are the ``client.admin`` default user. When running Ceph with
+``cephx`` enabled, you need to have a key for the ``client.admin`` user to run
+``ceph`` commands as the administrator.
+
+.. important:: To run Ceph commands on the command line with
+ ``cephx`` enabled, you need to create a key for the ``client.admin``
+ user, and create a secret file under ``/etc/ceph``.
+
+The following command will generate and register a ``client.admin``
+key on the monitor with admin capabilities and write it to a keyring
+on the local file system. If the key already exists, its current
+value will be returned. ::
+
+ sudo ceph auth get-or-create client.admin mds 'allow' osd 'allow *' mon 'allow *' > /etc/ceph/keyring
+
+See `Enabling Cephx`_ step 1 for stepwise details to enable ``cephx``.
+
+
+Monitor Keyrings
+----------------
+
+Ceph requires a keyring for the monitors. Use the `ceph-authtool`_ command to
+generate a secret monitor key and keyring. ::
+
+ sudo ceph-authtool {keyring} --create-keyring --gen-key -n mon.
+
+A cluster with multiple monitors must have identical keyrings for all
+monitors. So you must copy the keyring to each monitor host under the
+following directory::
+
+ /var/lib/ceph/mon/$cluster-$id
+
+See `Enabling Cephx`_ step 2 and 3 for stepwise details to enable ``cephx``.
+
+.. _ceph-authtool: ../../man/8/ceph-authtool/
+
+
+.. _enable-cephx:
+
+Enabling Cephx
+--------------
+
+When ``cephx`` is enabled, Ceph will look for the keyring in the default search
+path, which includes ``/etc/ceph/keyring``. You can override this location by
+adding a ``keyring`` option in the ``[global]`` section of your `Ceph
+configuration`_ file, but this is not recommended.
+
+Execute the following procedures to enable ``cephx`` on a cluster with ``cephx``
+disabled. If you (or your deployment utility) have already generated the keys,
+you may skip the steps related to generating keys.
+
+#. Create a ``client.admin`` key, and save a copy of the key for your client host::
+
+ ceph auth get-or-create client.admin mon 'allow *' mds 'allow *' osd 'allow *' -o /etc/ceph/keyring
+
+ **Warning:** This will clobber any existing ``/etc/ceph/keyring`` file. Be careful!
+
+#. Generate a secret monitor ``mon.`` key::
+
+ ceph-authtool --create-keyring --gen-key -n mon. /tmp/monitor-key
+
+#. Copy the mon keyring into a ``keyring`` file in every monitor's ``mon data`` directory::
+
+ cp /tmp/monitor-key /var/lib/ceph/mon/ceph-a/keyring
+
+#. Generate a secret key for every OSD, where ``{$id}`` is the OSD number::
+
+ ceph auth get-or-create osd.{$id} mon 'allow rwx' osd 'allow *' -o /var/lib/ceph/osd/ceph-{$id}/keyring
+
+#. Generate a secret key for every MDS, where ``{$id}`` is the MDS letter::
+
+ ceph auth get-or-create mds.{$id} mon 'allow rwx' osd 'allow *' mds 'allow *' -o /var/lib/ceph/mds/ceph-{$id}/keyring
+
+#. Enable ``cephx`` authentication for versions ``0.51`` and above by setting
+ the following options in the ``[global]`` section of your `Ceph configuration`_
+ file::
+
+ auth cluster required = cephx
+ auth service required = cephx
+ auth client required = cephx
+
+#. Or, enable ``cephx`` authentication for versions ``0.50`` and below by
+ setting the following option in the ``[global]`` section of your `Ceph
+ configuration`_ file::
+
+ auth supported = cephx
+
+.. deprecated:: 0.51
+
+#. Start or restart the Ceph cluster. ::
+
+ sudo service ceph -a start
+ sudo service ceph -a restart
+
+.. _disable-cephx:
+
+Disabling Cephx
+---------------
+
+The following procedure describes how to disable Cephx. If your cluster
+environment is relatively safe, you can avoid the computational expense of
+running authentication by disabling it. **We do not recommend it.** However, it may be
+easier during setup and/or troubleshooting to temporarily disable authentication.
+
+#. Disable ``cephx`` authentication for versions ``0.51`` and above by setting
+ the following options in the ``[global]`` section of your `Ceph configuration`_
+ file::
+
+ auth cluster required = none
+ auth service required = none
+ auth client required = none
+
+#. Or, disable ``cephx`` authentication for versions ``0.50`` and below
+ (deprecated as of version 0.51) by setting the following option in the
+ ``[global]`` section of your `Ceph configuration`_ file::
+
+ auth supported = none
+
+#. Start or restart the Ceph cluster. ::
+
+ sudo service ceph -a start
+ sudo service ceph -a restart
+
+
+Daemon Keyrings
+---------------
+
+With the exception of the monitors, daemon keyrings are generated in
+the same way that user keyrings are. By default, the daemons store
+their keyrings inside their data directory. The default keyring
+locations, and the capabilities necessary for the daemon to function,
+are shown below.
+
+``ceph-mon``
+
+:Location: ``$mon_data/keyring``
+:Capabilities: N/A
+
+``ceph-osd``
+
+:Location: ``$osd_data/keyring``
+:Capabilities: ``mon 'allow rwx' osd 'allow *'``
+
+``ceph-mds``
+
+:Location: ``$mds_data/keyring``
+:Capabilities: ``mon 'allow rwx' mds 'allow *' osd 'allow *'``
+
+``radosgw``
+
+:Location: ``$rgw_data/keyring``
+:Capabilities: ``mon 'allow r' osd 'allow rwx'``
+
+
+Note that the monitor keyring contains a key but no capabilities, and
+is not part of the cluster ``auth`` database.
+
+The daemon data directory locations default to directories of the form::
+
+ /var/lib/ceph/$type/$cluster-$id
+
+For example, ``osd.12`` would be::
+
+ /var/lib/ceph/osd/ceph-12
+
+You can override these locations, but it is not recommended.
+
+Cephx Administration
+====================
+
+Cephx uses shared secret keys for authentication, meaning both the client and
+the monitor cluster have a copy of the client's secret key. The authentication
+protocol is such that both parties are able to prove to each other they have a
+copy of the key without actually revealing it. This provides mutual
+authentication, which means the cluster is sure the user possesses the secret
+key, and the user is sure that the cluster has a copy of the secret key.
+
+Default users and pools are suitable for initial testing purposes. For test bed
+and production environments, you should create users and assign pool access to
+the users.
+
+.. _add-a-key:
+
+Add a Key
+---------
+
+Keys enable a specific user to access the monitor, metadata server and
+cluster according to capabilities assigned to the key. Capabilities are
+simple strings specifying some access permissions for a given server type.
+Each server type has its own string. All capabilities are simply listed
+in ``{type}`` and ``{capability}`` pairs on the command line::
+
+ sudo ceph auth get-or-create-key client.{username} {daemon1} {cap1} {daemon2} {cap2} ...
+
+For example, to create a user ``client.foo`` with access 'rw' for
+daemon type 'osd' and 'r' for daemon type 'mon'::
+
+ sudo ceph auth get-or-create-key client.foo osd rw mon r > keyring.foo
+
+.. note:: User names are associated to user types, which include ``client``,
+ ``admin``, ``osd``, ``mon``, and ``mds``. In most cases, you will be
+ creating keys for ``client`` users.
+
+.. _auth-delete-key:
+
+Delete a Key
+------------
+
+To delete a key for a user or a daemon, use ``ceph auth del``::
+
+ ceph auth del {daemon-type}.{ID|username}
+
+Where ``{daemon-type}`` is one of ``client``, ``osd``, ``mon``, or ``mds``,
+and ``{ID|username}`` is the ID of the daemon or the username.
+
+List Keys in your Cluster
+-------------------------
+
+To list the keys registered in your cluster::
+
+ sudo ceph auth list
+
+
+Cephx Commandline Options
+=========================
+
+When Ceph runs with Cephx enabled, you must specify a user name and a secret key
+on the command line. Alternatively, you may use the ``CEPH_ARGS`` environment
+variable to avoid re-entry of the user name and secret. ::
+
+ ceph --id {user-name} --keyring=/path/to/secret [commands]
+
+For example::
+
+ ceph --id client.admin --keyring=/etc/ceph/ceph.keyring [commands]
+
+
+Ceph supports the following usage for user name and secret:
+
+``--id`` | ``--user``
+
+:Description: Ceph identifies users with a type and an ID (e.g., ``TYPE.ID`` or
+ ``client.admin``, ``client.user1``). The ``id``, ``name`` and
+ ``-n`` options enable you to specify the ID portion of the user
+ name (e.g., ``admin``, ``user1``, ``foo``, etc.). You can specify
+ the user with the ``--id`` and omit the type. For example,
+ to specify user ``client.foo`` enter the following::
+
+ ceph --id foo --keyring /path/to/keyring health
+ ceph --user foo --keyring /path/to/keyring health
+
+
+``--name``
+
+:Description: Ceph identifies users with a type and an ID (e.g., ``TYPE.ID`` or
+ ``client.admin``, ``client.user1``). The ``--name`` and ``-n``
+ options enable you to specify the fully qualified user name.
+ You must specify the user type (typically ``client``) with the
+ user ID. For example::
+
+ ceph --name client.foo --keyring /path/to/keyring health
+ ceph -n client.foo --keyring /path/to/keyring health
+
+
+
+``--keyring``
+
+:Description: The path to the keyring containing one or more user name and
+ secret. The ``--secret`` option provides the same functionality,
+ but it does not work with Ceph RADOS Gateway, which uses
+ ``--secret`` for another purpose. You may retrieve a keyring with
+ ``ceph auth get-or-create`` and store it locally. This is a
+ preferred approach, because you can switch user names without
+ switching the keyring path. For example::
+
+ sudo rbd map foo --pool rbd myimage --id foo --keyring /path/to/keyring
+
+
+``--keyfile``
+
+:Description: The path to the key file containing the secret key for the user
+ specified by ``--id``, ``--name``, ``-n``, or ``--user``. You may
+ retrieve the key for a specific user with ``ceph auth get`` and
+ store it locally. Then, specify the path to the keyfile.
+ For example::
+
+ sudo rbd map foo --pool rbd myimage --id foo --keyfile /path/to/file
+
+
+.. note:: Add the user and secret to the ``CEPH_ARGS`` environment variable so that
+ you don’t need to enter them each time. You can override the environment
+ variable settings on the command line.
+
+
+Backward Compatibility
+======================
+
+.. versionadded:: Bobtail
+
+In Ceph Argonaut v0.48 and earlier versions, if you enable ``cephx``
+authentication, Ceph only authenticates the initial communication between the
+client and daemon; Ceph does not authenticate the subsequent messages they send
+to each other, which has security implications. In Ceph Bobtail and subsequent
+versions, Ceph authenticates all ongoing messages between the entities using the
+session key set up for that initial authentication.
+
+We identified a backward compatibility issue between Argonaut v0.48 (and prior
+versions) and Bobtail (and subsequent versions). During testing, if you
+attempted to use Argonaut (and earlier) daemons with Bobtail (and later)
+daemons, the Argonaut daemons did not know how to perform ongoing message
+authentication, while the Bobtail versions of the daemons insist on
+authenticating message traffic subsequent to the initial
+request/response--making it impossible for Argonaut (and prior) daemons to
+interoperate with Bobtail (and subsequent) daemons.
+
+We have addressed this potential problem by providing a means for Argonaut (and
+prior) systems to interact with Bobtail (and subsequent) systems. Here's how it
+works: by default, the newer systems will not insist on seeing signatures from
+older systems that do not know how to perform them, but will simply accept such
+messages without authenticating them. This new default behavior provides the
+advantage of allowing two different releases to interact. **We do not recommend
+this as a long term solution**. Allowing newer daemons to forgo ongoing
+authentication has the unfortunate security effect that an attacker with control
+of some of your machines or some access to your network can disable session
+security simply by claiming to be unable to sign messages.
+
+.. note:: Even if you don't actually run any old versions of Ceph,
+ the attacker may be able to force some messages to be accepted unsigned in the
+ default scenario. While running Cephx with the default scenario, Ceph still
+ authenticates the initial communication, but you lose desirable session security.
+
+If you know that you are not running older versions of Ceph, or you are willing
+to accept that old servers and new servers will not be able to interoperate, you
+can eliminate this security risk. If you do so, any Ceph system that is new
+enough to support session authentication and that has Cephx enabled will reject
+unsigned messages. To preclude new servers from interacting with old servers,
+include the following line into the ``[global]`` section of your `Ceph
+configuration`_ file directly below the line that specifies the use of Cephx
+for authentication::
+
+ cephx require signatures = true
+
+**We recommend migrating all daemons to the newer versions and enabling the
+foregoing flag** at the nearest practical time so that you may avail yourself
+of the enhanced authentication.
+
+.. _Ceph configuration: ../../config-cluster/ceph-conf
--- /dev/null
+.. index:: control, commands
+
+==================
+ Control Commands
+==================
+
+
+Monitor Commands
+================
+
+Monitor commands are issued using the ceph utility::
+
+ ceph [-m monhost] {command}
+
+The command is usually (though not always) of the form::
+
+ ceph {subsystem} {command}
+
+
+System Commands
+===============
+
+Execute the following to display the current status of the cluster. ::
+
+ ceph -s
+ ceph status
+
+Execute the following to display a running summary of the status of the cluster,
+and major events. ::
+
+ ceph -w
+
+Execute the following to show the monitor quorum, including which monitors are
+participating and which one is the leader. ::
+
+ ceph quorum_status
+
+Execute the following to query the status of a single monitor, including whether
+or not it is in the quorum. ::
+
+ ceph [-m monhost] mon_status
+
+
+Authentication Subsystem
+========================
+
+To add a keyring for an OSD, execute the following::
+
+ ceph auth add {osd} {--in-file|-i} {path-to-osd-keyring}
+
+To list the cluster's keys and their capabilities, execute the following::
+
+ ceph auth list
+
+
+Placement Group Subsystem
+=========================
+
+To display the statistics for all placement groups, execute the following::
+
+ ceph -- pg dump [--format {format}]
+
+The valid formats are ``plain`` (default) and ``json``.
+
+To display the statistics for all placement groups stuck in a specified state,
+execute the following::
+
+ ceph -- pg dump_stuck inactive|unclean|stale [--format {format}] [-t|--threshold {seconds}]
+
+
+``--format`` may be ``plain`` (default) or ``json``
+
+``--threshold`` defines how many seconds "stuck" is (default: 300)
+
+**Inactive** Placement groups cannot process reads or writes because they are waiting for an OSD
+with the most up-to-date data to come back.
+
+**Unclean** Placement groups contain objects that are not replicated the desired number
+of times. They should be recovering.
+
+**Stale** Placement groups are in an unknown state - the OSDs that host them have not
+reported to the monitor cluster in a while (configured by
+``mon_osd_report_timeout``).
+
+Revert "lost" objects to their prior state, either a previous version
+or delete them if they were just created. ::
+
+ ceph pg {pgid} mark_unfound_lost revert
+
+
+OSD Subsystem
+=============
+
+Query osd subsystem status. ::
+
+ ceph osd stat
+
+Write a copy of the most recent osd map to a file. See
+`osdmaptool`_. ::
+
+ ceph osd getmap -o file
+
+.. _osdmaptool: ../../man/8/osdmaptool
+
+Write a copy of the crush map from the most recent osd map to
+file. ::
+
+ ceph osd getcrushmap -o file
+
+The foregoing is functionally equivalent to ::
+
+ ceph osd getmap -o /tmp/osdmap
+ osdmaptool /tmp/osdmap --export-crush file
+
+Dump the OSD map. Valid formats for ``-f`` are ``plain`` and ``json``. If no
+``--format`` option is given, the OSD map is dumped as plain text. ::
+
+ ceph osd dump [--format {format}]
+
+Dump the OSD map as a tree with one line per OSD containing weight
+and state. ::
+
+ ceph osd tree [--format {format}]
+
+Find out where a specific object is or would be stored in the system::
+
+ ceph osd map <pool-name> <object-name>
+
+Add or move a new item (OSD) with the given id/name/weight at the specified
+location. ::
+
+ ceph osd crush set {id} {weight} [{loc1} [{loc2} ...]]
+
+Remove an existing item from the CRUSH map. ::
+
+ ceph osd crush remove {id}
+
+Move an existing bucket from one position in the hierarchy to another. ::
+
+ ceph osd crush move {id} {loc1} [{loc2} ...]
+
+Set the weight of the item given by ``{name}`` to ``{weight}``. ::
+
+ ceph osd crush reweight {name} {weight}
+
+Create a cluster snapshot. ::
+
+ ceph osd cluster_snap {name}
+
+Mark an OSD as lost. This may result in permanent data loss. Use with caution. ::
+
+ ceph osd lost {id} [--yes-i-really-mean-it]
+
+Create a new OSD. If no ID is given, a new ID is automatically selected
+if possible. ::
+
+ ceph osd create [{id}]
+
+Remove the given OSD(s). ::
+
+ ceph osd rm [{id}...]
+
+Query the current max_osd parameter in the osd map. ::
+
+ ceph osd getmaxosd
+
+Import the given OSD map. Note that this can be a bit dangerous,
+since the OSD map includes dynamic state about which OSDs are current
+on or offline; only do this if you've just modified a (very) recent
+copy of the map. ::
+
+ ceph osd setmap -i file
+
+Import the given crush map. ::
+
+ ceph osd setcrushmap -i file
+
+Set the ``max_osd`` parameter in the OSD map. This is necessary when
+expanding the storage cluster. ::
+
+ ceph osd setmaxosd
+
+Mark OSD ``{osd-num}`` down. ::
+
+ ceph osd down {osd-num}
+
+Mark OSD ``{osd-num}`` out of the distribution (i.e. allocated no data). ::
+
+ ceph osd out {osd-num}
+
+Mark ``{osd-num}`` in the distribution (i.e. allocated data). ::
+
+ ceph osd in {osd-num}
+
+List classes that are loaded in the ceph cluster. ::
+
+ ceph class list
+
+Set or clear the pause flags in the OSD map. If set, no IO requests
+will be sent to any OSD. Clearing the flags via unpause results in
+resending pending requests. ::
+
+ ceph osd pause
+ ceph osd unpause
+
+Set the weight of ``{osd-num}`` to ``{weight}``. Two OSDs with the same weight will receive
+roughly the same number of I/O requests and store approximately the
+same amount of data. ::
+
+ ceph osd reweight {osd-num} {weight}
+
+Reweights all the OSDs by reducing the weight of OSDs which are
+heavily overused. By default it will adjust the weights downward on
+OSDs which have 120% of the average utilization, but if you include
+threshold it will use that percentage instead. ::
+
+ ceph osd reweight-by-utilization [threshold]
+
+Adds/removes the address to/from the blacklist. When adding an address,
+you can specify how long it should be blacklisted in seconds; otherwise,
+it will default to 1 hour. A blacklisted address is prevented from
+connecting to any OSD. Blacklisting is most often used to prevent a
+lagging metadata server from making bad changes to data on the OSDs.
+
+These commands are mostly only useful for failure testing, as
+blacklists are normally maintained automatically and shouldn't need
+manual intervention. ::
+
+ ceph osd blacklist add ADDRESS[:source_port] [TIME]
+ ceph osd blacklist rm ADDRESS[:source_port]
+
+Creates/deletes a snapshot of a pool. ::
+
+ ceph osd pool mksnap {pool-name} {snap-name}
+ ceph osd pool rmsnap {pool-name} {snap-name}
+
+Creates/deletes/renames a storage pool. ::
+
+ ceph osd pool create {pool-name} pg_num [pgp_num]
+ ceph osd pool delete {pool-name}
+ ceph osd pool rename {old-name} {new-name}
+
+Changes a pool setting. ::
+
+ ceph osd pool set {pool-name} {field} {value}
+
+Valid fields are:
+
+ * ``size``: Sets the number of copies of data in the pool.
+ * ``crash_replay_interval``: The number of seconds to allow
+ clients to replay acknowledged but uncommitted requests.
+ * ``pg_num``: The placement group number.
+ * ``pgp_num``: Effective number when calculating pg placement.
+ * ``crush_ruleset``: rule number for mapping placement.
+
+Get the value of a pool setting. ::
+
+ ceph osd pool get {pool-name} {field}
+
+Valid fields are:
+
+ * ``pg_num``: The placement group number.
+ * ``pgp_num``: Effective number of placement groups when calculating placement.
+ * ``lpg_num``: The number of local placement groups.
+ * ``lpgp_num``: The number used for placing the local placement groups.
+
+
+Sends a scrub command to OSD ``{osd-num}``. To send the command to all OSDs, use ``*``. ::
+
+ ceph osd scrub {osd-num}
+
+Sends a repair command to osdN. To send the command to all osds, use ``*``. ::
+
+ ceph osd repair N
+
+Runs a simple throughput benchmark against osdN, writing ``TOTAL_BYTES``
+in write requests of ``BYTES_PER_WRITE`` each. By default, the test
+writes 1 GB in total in 4-MB increments. ::
+
+ ceph osd tell N bench [BYTES_PER_WRITE] [TOTAL_BYTES]
+
+
+MDS Subsystem
+=============
+
+Change configuration parameters on a running mds. ::
+
+ ceph mds tell {mds-id} injectargs '--{switch} {value} [--{switch} {value}]'
+
+Example::
+
+ ceph mds tell 0 injectargs '--debug_ms 1 --debug_mds 10'
+
+Enables debug messages. ::
+
+ ceph mds stat
+
+Displays the status of all metadata servers.
+
+.. todo:: ``ceph mds`` subcommands missing docs: set_max_mds, dump, getmap, stop, setmap
+
+
+Mon Subsystem
+=============
+
+Show monitor stats::
+
+ ceph mon stat
+
+ 2011-12-14 10:40:59.044395 mon <- [mon,stat]
+ 2011-12-14 10:40:59.057111 mon.1 -> 'e3: 5 mons at {a=10.1.2.3:6789/0,b=10.1.2.4:6789/0,c=10.1.2.5:6789/0,d=10.1.2.6:6789/0,e=10.1.2.7:6789/0}, election epoch 16, quorum 0,1,2,3' (0)
+
+The ``quorum`` list at the end lists monitor nodes that are part of the current quorum.
+
+This is also available more directly::
+
+ $ ./ceph quorum_status
+
+ 2011-12-14 10:44:20.417705 mon <- [quorum_status]
+ 2011-12-14 10:44:20.431890 mon.0 ->
+
+.. code-block:: javascript
+
+ '{ "election_epoch": 10,
+ "quorum": [
+ 0,
+ 1,
+ 2],
+ "monmap": { "epoch": 1,
+ "fsid": "444b489c-4f16-4b75-83f0-cb8097468898",
+ "modified": "2011-12-12 13:28:27.505520",
+ "created": "2011-12-12 13:28:27.505520",
+ "mons": [
+ { "rank": 0,
+ "name": "a",
+ "addr": "127.0.0.1:6789\/0"},
+ { "rank": 1,
+ "name": "b",
+ "addr": "127.0.0.1:6790\/0"},
+ { "rank": 2,
+ "name": "c",
+ "addr": "127.0.0.1:6791\/0"}]}}' (0)
+
+The above will block until a quorum is reached.
+
+For a status of just the monitor you connect to (use ``-m HOST:PORT``
+to select)::
+
+ ceph mon_status
+
+
+ 2011-12-14 10:45:30.644414 mon <- [mon_status]
+ 2011-12-14 10:45:30.644632 mon.0 ->
+
+.. code-block:: javascript
+
+ '{ "name": "a",
+ "rank": 0,
+ "state": "leader",
+ "election_epoch": 10,
+ "quorum": [
+ 0,
+ 1,
+ 2],
+ "outside_quorum": [],
+ "monmap": { "epoch": 1,
+ "fsid": "444b489c-4f16-4b75-83f0-cb8097468898",
+ "modified": "2011-12-12 13:28:27.505520",
+ "created": "2011-12-12 13:28:27.505520",
+ "mons": [
+ { "rank": 0,
+ "name": "a",
+ "addr": "127.0.0.1:6789\/0"},
+ { "rank": 1,
+ "name": "b",
+ "addr": "127.0.0.1:6790\/0"},
+ { "rank": 2,
+ "name": "c",
+ "addr": "127.0.0.1:6791\/0"}]}}' (0)
+
+A dump of the monitor state::
+
+ ceph mon dump
+
+ 2011-12-14 10:43:08.015333 mon <- [mon,dump]
+ 2011-12-14 10:43:08.015567 mon.0 -> 'dumped monmap epoch 1' (0)
+ epoch 1
+ fsid 444b489c-4f16-4b75-83f0-cb8097468898
+ last_changed 2011-12-12 13:28:27.505520
+ created 2011-12-12 13:28:27.505520
+ 0: 127.0.0.1:6789/0 mon.a
+ 1: 127.0.0.1:6790/0 mon.b
+ 2: 127.0.0.1:6791/0 mon.c
+
--- /dev/null
+============
+ CRUSH Maps
+============
+
+The :abbr:`CRUSH (Controlled Replication Under Scalable Hashing)` algorithm
+determines how to store and retrieve data by computing data storage locations.
+CRUSH empowers Ceph clients to communicate with OSDs directly rather than
+through a centralized server or broker. With an algorithmically determined
+method of storing and retrieving data, Ceph avoids a single point of failure, a
+performance bottleneck, and a physical limit to its scalability.
+
+CRUSH requires a map of your cluster, and uses the CRUSH map to pseudo-randomly
+store and retrieve data in OSDs with a uniform distribution of data across the
+cluster. For a detailed discussion of CRUSH, see
+`CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_
+
+.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: http://ceph.com/papers/weil-crush-sc06.pdf
+
+CRUSH Maps contain a list of :abbr:`OSDs (Object Storage Devices)`, a list of
+'buckets' for aggregating the devices into physical locations, and a list of
+rules that tell CRUSH how it should replicate data in a Ceph cluster's pools. By
+reflecting the underlying physical organization of the installation, CRUSH can
+model—and thereby address—potential sources of correlated device failures.
+Typical sources include physical proximity, a shared power source, and a shared
+network. By encoding this information into the cluster map, CRUSH placement
+policies can separate object replicas across different failure domains while
+still maintaining the desired distribution. For example, to address the
+possibility of concurrent failures, it may be desirable to ensure that data
+replicas are on devices in different shelves, racks, power supplies,
+controllers, and/or physical locations.
+
+When you create a configuration file and deploy Ceph with ``mkcephfs``, Ceph
+generates a default CRUSH map for your configuration. The default CRUSH map is
+fine for your Ceph sandbox environment. However, when you deploy a large-scale
+data cluster, you should give significant consideration to developing a custom
+CRUSH map, because it will help you manage your Ceph cluster, improve
+performance and ensure data safety.
+
+For example, if an OSD goes down, a CRUSH Map can help you locate
+the physical data center, room, row and rack of the host with the failed OSD in
+the event you need to use onsite support or replace hardware.
+
+Similarly, CRUSH may help you identify faults more quickly. For example, if all
+OSDs in a particular rack go down simultaneously, the fault may lie with a
+network switch or with power to the rack, rather than with the
+OSDs themselves.
+
+A custom CRUSH map can also help you identify the physical locations where
+Ceph stores redundant copies of data when the placement group(s) associated
+with a failed host are in a degraded state.
+
+`Inktank`_ provides excellent premium support for developing CRUSH maps.
+
+.. _Inktank: http://www.inktank.com
+
+.. note:: Lines of code in example boxes may extend past the edge of the box.
+ Please scroll when reading or copying longer examples.
+
+Editing a CRUSH Map
+===================
+
+To edit an existing CRUSH map:
+
+#. `Get the CRUSH Map`_.
+#. `Decompile`_ the CRUSH Map.
+#. Edit at least one of `Devices`_, `Buckets`_ and `Rules`_.
+#. `Recompile`_ the CRUSH Map.
+#. `Set the CRUSH Map`_.
+
+To activate CRUSH Map rules for a specific pool, identify the common ruleset
+number for those rules and specify that ruleset number for the pool. See `Set
+Pool Values`_ for details.
+
+.. _Get the CRUSH Map: #getcrushmap
+.. _Decompile: #decompilecrushmap
+.. _Devices: #crushmapdevices
+.. _Buckets: #crushmapbuckets
+.. _Rules: #crushmaprules
+.. _Recompile: #compilecrushmap
+.. _Set the CRUSH Map: #setcrushmap
+.. _Set Pool Values: ../pools#setpoolvalues
+
+.. _getcrushmap:
+
+Get a CRUSH Map
+---------------
+
+To get the CRUSH Map for your cluster, execute the following::
+
+ ceph osd getcrushmap -o {compiled-crushmap-filename}
+
+Ceph will output (-o) a compiled CRUSH Map to the filename you specified. Since
+the CRUSH Map is in a compiled form, you must decompile it first before you can
+edit it.
+
+.. _decompilecrushmap:
+
+Decompile a CRUSH Map
+---------------------
+
+To decompile a CRUSH Map, execute the following::
+
+ crushtool -d {compiled-crushmap-filename} -o {decompiled-crushmap-filename}
+
+Ceph will decompile (-d) the compiled CRUSH map and output (-o) it to the
+filename you specified.
+
+
+.. _compilecrushmap:
+
+Compile a CRUSH Map
+-------------------
+
+To compile a CRUSH Map, execute the following::
+
+ crushtool -c {decompiled-crush-map-filename} -o {compiled-crush-map-filename}
+
+Ceph will store a compiled CRUSH map to the filename you specified.
+
+
+.. _setcrushmap:
+
+Set a CRUSH Map
+---------------
+
+To set the CRUSH Map for your cluster, execute the following::
+
+ ceph osd setcrushmap -i {compiled-crushmap-filename}
+
+Ceph will input the compiled CRUSH Map of the filename you specified as the
+CRUSH Map for the cluster.
+
+
+
+CRUSH Map Parameters
+====================
+
+There are three main sections to a CRUSH Map.
+
+#. Devices consist of any object storage device--i.e., the hard disk
+ corresponding to a ``ceph-osd`` daemon.
+#. Buckets consist of a hierarchical aggregation of storage locations
+ (e.g., rows, racks, hosts, etc.) and their assigned weights.
+#. Rules consist of the manner of selecting buckets
+
+
+.. _crushmapdevices:
+
+CRUSH Map Devices
+-----------------
+
+To map placement groups to OSDs, a CRUSH Map requires a list of OSD devices
+(i.e., the name of the OSD daemon). The list of devices appears first in the
+CRUSH Map. ::
+
+ #devices
+ device {num} {osd.name}
+
+For example::
+
+ #devices
+ device 0 osd.0
+ device 1 osd.1
+ device 2 osd.2
+ device 3 osd.3
+
+As a general rule, an OSD daemon maps to a single disk or to a RAID.
+
+
+.. _crushmapbuckets:
+
+CRUSH Map Buckets
+-----------------
+
+CRUSH maps support the notion of 'buckets', which may be thought of as nodes
+that aggregate other buckets into a hierarchy of physical locations, where OSD
+devices are the leaves of the hierarchy. The following table lists the default
+types.
+
++------+-------------+----------------------------------------------------+
+| Type | Location    | Description                                        |
++======+=============+====================================================+
+| 0 | OSD | An OSD daemon (e.g., osd.1, osd.2, etc). |
++------+-------------+----------------------------------------------------+
+| 1 | Host | A host name containing one or more OSDs. |
++------+-------------+----------------------------------------------------+
+| 2 | Rack | A computer rack. The default is ``unknownrack``. |
++------+-------------+----------------------------------------------------+
+| 3 | Row | A row in a series of racks. |
++------+-------------+----------------------------------------------------+
+| 4 | Room | A room containing racks and rows of hosts. |
++------+-------------+----------------------------------------------------+
+| 5 | Data Center | A physical data center containing rooms. |
++------+-------------+----------------------------------------------------+
+| 6 | Pool | A data storage pool for storing objects. |
++------+-------------+----------------------------------------------------+
+
+.. tip:: You can remove these types and create your own bucket types.
+
+Ceph's deployment tools generate a CRUSH map that contains a bucket for each
+host, and a pool named "default," which is useful for the default ``data``,
+``metadata`` and ``rbd`` pools. The remaining bucket types provide a means for
+storing information about the physical location of nodes/buckets, which makes
+cluster administration much easier when OSDs, hosts, or network hardware
+malfunction and the administrator needs access to physical hardware.
+
+.. tip:: The term "bucket" used in the context of CRUSH means a Ceph pool, a
+ location, or a piece of physical hardware. It is a different concept from
+ the term "bucket" when used in the context of RADOS Gateway APIs.
+
+A bucket has a type, a unique name (string), a unique ID expressed as a negative
+integer, a weight relative to the total capacity/capability of its item(s), the
+bucket algorithm (``straw`` by default), and the hash (``0`` by default, reflecting
+CRUSH Hash ``rjenkins1``). A bucket may have one or more items. The items may
+consist of other buckets or OSDs. Items may have a weight that reflects the
+relative weight of the item.
+
+::
+
+ [bucket-type] [bucket-name] {
+ id [a unique negative numeric ID]
+ weight [the relative capacity/capability of the item(s)]
+ alg [the bucket algorithm: uniform | list | tree | straw ]
+ hash [the hash type: 0 by default]
+ item [item-name] weight [weight]
+ }
+
+The following example illustrates how you can use buckets to aggregate a pool and
+physical locations like a datacenter, a room, a rack and a row. ::
+
+ host ceph-osd-server-1 {
+ id -17
+ alg straw
+ hash 0
+ item osd.0 weight 1.00
+ item osd.1 weight 1.00
+ }
+
+ row rack-1-row-1 {
+ id -16
+ alg straw
+ hash 0
+ item ceph-osd-server-1 weight 2.00
+ }
+
+ rack rack-3 {
+ id -15
+ alg straw
+ hash 0
+ item rack-3-row-1 weight 2.00
+ item rack-3-row-2 weight 2.00
+ item rack-3-row-3 weight 2.00
+ item rack-3-row-4 weight 2.00
+ item rack-3-row-5 weight 2.00
+ }
+
+ rack rack-2 {
+ id -14
+ alg straw
+ hash 0
+ item rack-2-row-1 weight 2.00
+ item rack-2-row-2 weight 2.00
+ item rack-2-row-3 weight 2.00
+ item rack-2-row-4 weight 2.00
+ item rack-2-row-5 weight 2.00
+ }
+
+ rack rack-1 {
+ id -13
+ alg straw
+ hash 0
+ item rack-1-row-1 weight 2.00
+ item rack-1-row-2 weight 2.00
+ item rack-1-row-3 weight 2.00
+ item rack-1-row-4 weight 2.00
+ item rack-1-row-5 weight 2.00
+ }
+
+ room server-room-1 {
+ id -12
+ alg straw
+ hash 0
+ item rack-1 weight 10.00
+ item rack-2 weight 10.00
+ item rack-3 weight 10.00
+ }
+
+ datacenter dc-1 {
+ id -11
+ alg straw
+ hash 0
+ item server-room-1 weight 30.00
+ item server-room-2 weight 30.00
+ }
+
+ pool data {
+ id -10
+ alg straw
+ hash 0
+ item dc-1 weight 60.00
+ item dc-2 weight 60.00
+ }
+
+.. _crushmaprules:
+
+CRUSH Map Rules
+---------------
+
+CRUSH maps support the notion of 'CRUSH rules', which are the rules that
+determine data placement for a pool. For large clusters, you will likely create
+many pools where each pool may have its own CRUSH ruleset and rules. The default
+CRUSH map has a rule for each pool, and one ruleset assigned to each of the
+default pools, which include:
+
+- ``data``
+- ``metadata``
+- ``rbd``
+
+.. note:: In most cases, you will not need to modify the default rules. When
+ you create a new pool, its default ruleset is ``0``.
+
+A rule takes the following form::
+
+ rule [rulename] {
+
+ ruleset [ruleset]
+ type [type]
+ min_size [min-size]
+ max_size [max-size]
+ step [step]
+
+ }
+
+
+``ruleset``
+
+:Description: A means of classifying a rule as belonging to a set of rules. Activated by `setting the ruleset in a pool`_.
+:Purpose: A component of the rule mask.
+:Type: Integer
+:Required: Yes
+:Default: 0
+
+.. _setting the ruleset in a pool: ../pools#setpoolvalues
+
+
+``type``
+
+:Description: Describes a rule for either a hard disk (replicated) or a RAID.
+:Purpose: A component of the rule mask.
+:Type: String
+:Required: Yes
+:Default: ``replicated``
+:Valid Values: Currently only ``replicated``
+
+``min_size``
+
+:Description: If a placement group makes fewer replicas than this number, CRUSH will NOT select this rule.
+:Type: Integer
+:Purpose: A component of the rule mask.
+:Required: Yes
+:Default: ``1``
+
+``max_size``
+
+:Description: If a placement group makes more replicas than this number, CRUSH will NOT select this rule.
+:Type: Integer
+:Purpose: A component of the rule mask.
+:Required: Yes
+:Default: 10
+
+
+``step take {bucket}``
+
+:Description: Takes a bucket name, and begins iterating down the tree.
+:Purpose: A component of the rule.
+:Required: Yes
+:Example: ``step take data``
+
+
+``step choose firstn {num} type {bucket-type}``
+
+:Description: Selects the number of buckets of the given type. Where ``N`` is the number of options available, if ``{num} > 0 && < N``, choose that many buckets; if ``{num} < 0``, it means ``N - {num}``; and, if ``{num} == 0``, choose ``N`` buckets (all available).
+:Purpose: A component of the rule.
+:Prerequisite: Follows ``step take`` or ``step choose``.
+:Example: ``step choose firstn 1 type row``
+
+
+``step emit``
+
+:Description: Outputs the current value and empties the stack. Typically used at the end of a rule, but may also be used to pick from different trees in the same rule.
+:Purpose: A component of the rule.
+:Prerequisite: Follows ``step choose``.
+:Example: ``step emit``
+
+.. important:: To activate one or more rules with a common ruleset number to a pool, set the ruleset number to the pool.
+
+
+.. _addosd:
+
+Add/Move an OSD
+===============
+
+To add or move an OSD in the CRUSH map of a running cluster, execute the
+following::
+
+ ceph osd crush set {id} {name} {weight} pool={pool-name} [{bucket-type}={bucket-name}, ...]
+
+Where:
+
+``id``
+
+:Description: The numeric ID of the OSD.
+:Type: Integer
+:Required: Yes
+:Example: ``0``
+
+
+``name``
+
+:Description: The full name of the OSD.
+:Type: String
+:Required: Yes
+:Example: ``osd.0``
+
+
+``weight``
+
+:Description: The CRUSH weight for the OSD.
+:Type: Double
+:Required: Yes
+:Example: ``2.0``
+
+
+``pool``
+
+:Description: By default, the CRUSH hierarchy contains the pool name at its root.
+:Type: Key/value pair.
+:Required: Yes
+:Example: ``pool=data``
+
+
+``bucket-type``
+
+:Description: You may specify the OSD's location in the CRUSH hierarchy.
+:Type: Key/value pairs.
+:Required: No
+:Example: ``datacenter=dc1, room=room1, row=foo, rack=bar, host=foo-bar-1``
+
+
+The following example adds ``osd.0`` to the hierarchy, or moves the OSD from a
+previous location. ::
+
+ ceph osd crush set 0 osd.0 1.0 pool=data datacenter=dc1, room=room1, row=foo, rack=bar, host=foo-bar-1
+
+
+Adjust an OSD's CRUSH Weight
+============================
+
+To adjust an OSD's crush weight in the CRUSH map of a running cluster, execute
+the following::
+
+ ceph osd crush reweight {name} {weight}
+
+Where:
+
+``name``
+
+:Description: The full name of the OSD.
+:Type: String
+:Required: Yes
+:Example: ``osd.0``
+
+
+``weight``
+
+:Description: The CRUSH weight for the OSD.
+:Type: Double
+:Required: Yes
+:Example: ``2.0``
+
+
+.. _removeosd:
+
+Remove an OSD
+=============
+
+To remove an OSD from the CRUSH map of a running cluster, execute the following::
+
+ ceph osd crush remove {name}
+
+Where:
+
+``name``
+
+:Description: The full name of the OSD.
+:Type: String
+:Required: Yes
+:Example: ``osd.0``
+
+
+Move a Bucket
+=============
+
+To move a bucket to a different location or position in the CRUSH map hierarchy,
+execute the following::
+
+ ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...]
+
+Where:
+
+``bucket-name``
+
+:Description: The name of the bucket to move/reposition.
+:Type: String
+:Required: Yes
+:Example: ``foo-bar-1``
+
+``bucket-type``
+
+:Description: You may specify the bucket's location in the CRUSH hierarchy.
+:Type: Key/value pairs.
+:Required: No
+:Example: ``datacenter=dc1, room=room1, row=foo, rack=bar, host=foo-bar-1``
+
+
+Tunables
+========
+
+.. versionadded:: 0.48
+
+There are several magic numbers that were used in the original CRUSH
+implementation that have proven to be poor choices. To support
+the transition away from them, newer versions of CRUSH (starting with
+the v0.48 argonaut series) allow the values to be adjusted or tuned.
+
+Clusters running recent Ceph releases support using the tunable values
+in the CRUSH maps. However, older clients and daemons will not correctly interact
+with clusters using the "tuned" CRUSH maps. To detect this situation,
+there is now a feature bit ``CRUSH_TUNABLES`` (value 0x40000) to
+reflect support for tunables.
+
+If the OSDMap currently used by the ``ceph-mon`` or ``ceph-osd``
+daemon has non-legacy values, it will require the ``CRUSH_TUNABLES``
+feature bit from clients and daemons who connect to it. This means
+that old clients will not be able to connect.
+
+At some future point in time, newly created clusters will have
+improved default values for the tunables. This is a matter of waiting
+until the support has been present in the Linux kernel clients long
+enough to make this a painless transition for most users.
+
+Impact of Legacy Values
+-----------------------
+
+The legacy values result in several misbehaviors:
+
+ * For hierarchies with a small number of devices in the leaf buckets,
+ some PGs map to fewer than the desired number of replicas. This
+ commonly happens for hierarchies with "host" nodes with a small
+ number (1-3) of OSDs nested beneath each one.
+
+ * For large clusters, some small percentages of PGs map to less than
+ the desired number of OSDs. This is more prevalent when there are
+ several layers of the hierarchy (e.g., row, rack, host, osd).
+
+ * When some OSDs are marked out, the data tends to get redistributed
+ to nearby OSDs instead of across the entire hierarchy.
+
+Which client versions support tunables
+--------------------------------------
+
+ * argonaut series, v0.48.1 or later
+ * v0.49 or later
+ * Linux kernel version v3.5 or later (for the file system and RBD kernel clients)
+
+A few important points
+----------------------
+
+ * Adjusting these values will result in the shift of some PGs between
+ storage nodes. If the Ceph cluster is already storing a lot of
+ data, be prepared for some fraction of the data to move.
+ * The ``ceph-osd`` and ``ceph-mon`` daemons will start requiring the
+ ``CRUSH_TUNABLES`` feature of new connections as soon as they get
+ the updated map. However, already-connected clients are
+ effectively grandfathered in, and will misbehave if they do not
+ support the new feature.
+ * If the CRUSH tunables are set to non-legacy values and then later
+ changed back to the default values, ``ceph-osd`` daemons will not be
+ required to support the feature. However, the OSD peering process
+ requires examining and understanding old maps. Therefore, you
+ should not run old (pre-v0.48) versions of the ``ceph-osd`` daemon
+ if the cluster has previously used non-legacy CRUSH values, even if
+ the latest version of the map has been switched back to using the
+ legacy defaults.
+
+Tuning CRUSH
+------------
+
+If you can ensure that all clients are running recent code, you can
+adjust the tunables by extracting the CRUSH map, modifying the values,
+and reinjecting it into the cluster.
+
+* Extract the latest CRUSH map::
+
+ ceph osd getcrushmap -o /tmp/crush
+
+* Adjust tunables. These values appear to offer the best behavior
+ for both large and small clusters we tested with. You will need to
+ additionally specify the ``--enable-unsafe-tunables`` argument to
+ ``crushtool`` for this to work. Please use this option with
+ extreme care.::
+
+ crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new
+
+* Reinject modified map::
+
+ ceph osd setcrushmap -i /tmp/crush.new
+
+Legacy values
+-------------
+
+For reference, the legacy values for the CRUSH tunables can be set
+with::
+
+ crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 -o /tmp/crush.legacy
+
+Again, the special ``--enable-unsafe-tunables`` option is required.
+Further, as noted above, be careful running old versions of the
+``ceph-osd`` daemon after reverting to legacy values as the feature
+bit is not perfectly enforced.
+
--- /dev/null
+=========================
+ Data Placement Overview
+=========================
+
+Ceph stores, replicates and rebalances data objects across a RADOS cluster
+dynamically. With many different users storing objects in different pools for
+different purposes on countless OSDs, Ceph operations require some data
+placement planning. The main data placement planning concepts in Ceph include:
+
+- **Pools:** Ceph stores data within pools, which are logical groups for storing
+ objects. Pools manage the number of placement groups, the number of replicas,
+ and the ruleset for the pool. To store data in a pool, you must have
+ an authenticated user with permissions for the pool. Ceph can snapshot pools.
+ Future versions of Ceph will support namespaces within pools.
+
+- **Placement Groups:** Ceph maps objects to placement groups (PGs).
+ Placement groups (PGs) are shards or fragments of a logical object pool
+ that place objects as a group into OSDs. Placement groups reduce the amount
+ of per-object metadata when Ceph stores the data in OSDs. A larger number of
+ placement groups (e.g., 100 per OSD) leads to better balancing.
+
+- **CRUSH Maps:** CRUSH is a big part of what allows Ceph to scale without
+ performance bottlenecks, without limitations to scalability, and without a
+ single point of failure. CRUSH maps provide the physical topology of the
+ cluster to the CRUSH algorithm to determine where the data for an object
+ and its replicas should be stored, and how to do so across failure domains
+ for added data safety among other things.
+
+When you initially set up a test cluster, you can use the default values. Once
+you begin planning for a large Ceph cluster, refer to pools, placement groups
+and CRUSH for data placement operations. If you find some aspects challenging,
+`Inktank`_ provides excellent premium support for Ceph.
+
+.. _Inktank: http://www.inktank.com
\ No newline at end of file
--- /dev/null
+=======================
+ Debugging and Logging
+=======================
+
+You may view Ceph log files under ``/var/log/ceph`` (the default location).
+
+Ceph is still on the leading edge, so you may encounter situations that require
+using Ceph's debugging and logging. To activate and configure Ceph's debug
+logging, refer to `Ceph Logging and Debugging`_. For additional logging
+settings, refer to the `Logging and Debugging Config Reference`_.
+
+.. _Ceph Logging and Debugging: ../../config-cluster/ceph-conf#ceph-logging-and-debugging
+.. _Logging and Debugging Config Reference: ../../config-cluster/log-and-debug-ref
+
+You can change the logging settings at runtime so that you don't have to
+stop and restart the cluster. Refer to `Ceph Configuration - Runtime Changes`_
+for additional details.
+
+Debugging may also require you to track down memory and threading issues.
+You can run a single daemon, a type of daemon, or the whole cluster with
+Valgrind. You should only use Valgrind when developing or debugging Ceph.
+Valgrind is computationally expensive, and will slow down your system otherwise.
+Valgrind messages are logged to ``stderr``.
+
+.. _Ceph Configuration - Runtime Changes: ../../config-cluster/ceph-conf#ceph-runtime-config
--- /dev/null
+====================
+ Cluster Operations
+====================
+
+.. raw:: html
+
+ <table><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>High-level Operations</h3>
+
+High-level cluster operations consist primarily of starting, stopping, and
+restarting a cluster with the ``ceph`` service; checking the cluster's health;
+and, monitoring an operating cluster.
+
+.. toctree::
+
+ operating
+ monitoring
+ troubleshooting
+ debug
+
+.. raw:: html
+
+ </td><td><h3>Data Placement</h3>
+
+Once you have your cluster up and running, you may begin working with data
+placement. Ceph supports petabyte-scale data storage clusters, with storage
+pools and placement groups that distribute data across the cluster using Ceph's
+CRUSH algorithm.
+
+.. toctree::
+
+ data-placement
+ pools
+ placement-groups
+ crush-map
+
+
+
+.. raw:: html
+
+ </td></tr><tr><td><h3>Authentication and Authorization</h3>
+
+Once you have data placement policies in place, you can begin creating users
+and assigning them capabilities, such as the ability to read and write data
+to one or more pools, or the cluster as a whole.
+
+.. toctree::
+
+ Cephx Overview <auth-intro>
+ authentication
+
+
+
+.. raw:: html
+
+ </td><td><h3>Daemon Operations</h3>
+
+Low-level cluster operations consist of starting, stopping, and restarting a
+particular daemon within a cluster; changing the settings of a particular
+daemon or subsystem; and, adding a daemon to the cluster or removing a daemon
+from the cluster. The most common use cases for low-level operations include
+growing or shrinking the Ceph cluster and replacing legacy or failed hardware
+with new hardware.
+
+.. toctree::
+
+ add-or-rm-osds
+ add-or-rm-mons
+ Command Reference <control>
+
+
+.. raw:: html
+
+ </td></tr></tbody></table>
+
--- /dev/null
+======================
+ Monitoring a Cluster
+======================
+
+Once you have a running cluster, you may use the ``ceph`` tool to monitor your
+cluster. Monitoring a cluster typically involves checking OSD status, monitor
+status, placement group status and metadata server status.
+
+Interactive Mode
+================
+
+To run the ``ceph`` tool in interactive mode, type ``ceph`` at the command line
+with no arguments. For example::
+
+ ceph
+ ceph> health
+ ceph> status
+ ceph> quorum_status
+ ceph> mon_status
+
+
+Checking Cluster Health
+=======================
+
+After you start your cluster, and before you start reading and/or
+writing data, check your cluster's health first. You can check on the
+health of your Ceph cluster with the following::
+
+ ceph health
+
+If you specified non-default locations for your configuration or keyring,
+you may specify their locations::
+
+ ceph -c /path/to/conf -k /path/to/keyring health
+
+Upon starting the Ceph cluster, you will likely encounter a health
+warning such as ``HEALTH_WARN XXX num placement groups stale``. Wait a few moments and check
+it again. When your cluster is ready, ``ceph health`` should return a message
+such as ``HEALTH_OK``. At that point, it is okay to begin using the cluster.
+
+Watching a Cluster
+==================
+
+To watch the cluster's ongoing events, open a new terminal. Then, enter::
+
+ ceph -w
+
+Ceph will print each version of the placement group map and their status. For
+example, a tiny Ceph cluster consisting of one monitor, one metadata server and
+two OSDs may print the following::
+
+ health HEALTH_OK
+ monmap e1: 1 mons at {a=192.168.0.1:6789/0}, election epoch 0, quorum 0 a
+ osdmap e13: 2 osds: 2 up, 2 in
+ placement groupmap v9713: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
+ mdsmap e4: 1/1/1 up {0=a=up:active}
+
+ 2012-08-01 11:33:53.831268 mon.0 [INF] placement groupmap v9712: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
+ 2012-08-01 11:35:31.904650 mon.0 [INF] placement groupmap v9713: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
+ 2012-08-01 11:35:53.903189 mon.0 [INF] placement groupmap v9714: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
+ 2012-08-01 11:37:31.865809 mon.0 [INF] placement groupmap v9715: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
+
+
+Checking a Cluster's Status
+===========================
+
+To check a cluster's status, execute the following::
+
+ ceph status
+
+Or::
+
+ ceph -s
+
+In interactive mode, type ``status`` and press **Enter**. ::
+
+ ceph> status
+
+Ceph will print the cluster status. For example, a tiny Ceph cluster consisting
+of one monitor, one metadata server and two OSDs may print the following::
+
+ health HEALTH_OK
+ monmap e1: 1 mons at {a=192.168.0.1:6789/0}, election epoch 0, quorum 0 a
+ osdmap e13: 2 osds: 2 up, 2 in
+ placement groupmap v9754: 384 placement groups: 384 active+clean; 8730 bytes data, 22948 MB used, 264 GB / 302 GB avail
+ mdsmap e4: 1/1/1 up {0=a=up:active}
+
+
+Checking OSD Status
+===================
+
+An OSD's status is either in the cluster (``in``) or out of the
+cluster (``out``); and, it is either up and running (``up``), or it is down and
+not running (``down``). If an OSD is ``up``, it may be either ``in`` the
+cluster (you can read and write data) or ``out`` of the cluster. If
+it is ``down``, it should also be ``out``. If an OSD is ``down`` and ``in``,
+there is a problem.
+
+.. ditaa:: +----------------+ +----------------+
+ | | | |
+ | OSD #n In | | OSD #n Up |
+ | | | |
+ +----------------+ +----------------+
+ ^ ^
+ | |
+ | |
+ v v
+ +----------------+ +----------------+
+ | | | |
+ | OSD #n Out | | OSD #n Down |
+ | | | |
+ +----------------+ +----------------+
+
+You can check OSDs to ensure they are ``up`` and ``in`` by executing::
+
+ ceph osd stat
+
+Or::
+
+ ceph osd dump
+
+You can also view OSDs according to their position in the CRUSH map. ::
+
+ ceph osd tree
+
+Ceph will print out a CRUSH tree with a host, its OSDs, whether they are up
+and their weight. ::
+
+ # id weight type name up/down reweight
+ -1 3 pool default
+ -3 3 rack mainrack
+ -2 3 host osd-host
+ 0 1 osd.0 up 1
+ 1 1 osd.1 up 1
+ 2 1 osd.2 up 1
+
+
+Checking Monitor Status
+=======================
+
+If your cluster has multiple monitors (likely), you should check the monitor
+quorum status after you start the cluster before reading and/or writing data. A
+quorum must be present when multiple monitors are running. You should also check
+monitor status periodically to ensure that they are running.
+
+To display the monitor map, execute the following::
+
+ ceph mon stat
+
+Or::
+
+ ceph mon dump
+
+To check the quorum status for the monitor cluster, execute the following::
+
+ ceph quorum_status
+
+Ceph will return the quorum status. For example, a Ceph cluster consisting of
+three monitors may return the following:
+
+.. code-block:: javascript
+
+ { "election_epoch": 10,
+ "quorum": [
+ 0,
+ 1,
+ 2],
+ "monmap": { "epoch": 1,
+ "fsid": "444b489c-4f16-4b75-83f0-cb8097468898",
+ "modified": "2011-12-12 13:28:27.505520",
+ "created": "2011-12-12 13:28:27.505520",
+ "mons": [
+ { "rank": 0,
+ "name": "a",
+ "addr": "127.0.0.1:6789\/0"},
+ { "rank": 1,
+ "name": "b",
+ "addr": "127.0.0.1:6790\/0"},
+ { "rank": 2,
+ "name": "c",
+ "addr": "127.0.0.1:6791\/0"}
+ ]
+ }
+ }
+
+Checking MDS Status
+===================
+
+Metadata servers provide metadata services for Ceph FS. Metadata servers have
+two sets of states: ``up | down`` and ``active | inactive``. To ensure your
+metadata servers are ``up`` and ``active``, execute the following::
+
+ ceph mds stat
+
+To display details of the metadata cluster, execute the following::
+
+ ceph mds dump
+
+
+Checking Placement Group States
+===============================
+
+Placement groups map objects to OSDs. When you monitor your
+placement groups, you will want them to be ``active`` and ``clean``. For other
+PG states, see `Placement Group States`_.
+
+.. _Placement Group States: ../pg-states
--- /dev/null
+=====================
+ Operating a Cluster
+=====================
+
+The ``ceph`` service provides functionality to **start**, **restart**, and
+**stop** your Ceph cluster. Each time you execute ``ceph`` processes, you
+must specify at least one option and one command. You may also specify a daemon
+type or a daemon instance. For most newer Debian/Ubuntu distributions, you may
+use the following syntax::
+
+ sudo service ceph [options] [commands] [daemons]
+
+For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
+
+ sudo /etc/init.d/ceph [options] [commands] [daemons]
+
+The ``ceph`` service options include:
+
++-----------------+----------+-------------------------------------------------+
+| Option | Shortcut | Description |
++=================+==========+=================================================+
+| ``--verbose`` | ``-v`` | Use verbose logging. |
++-----------------+----------+-------------------------------------------------+
+| ``--valgrind`` | ``N/A`` | (Dev and QA only) Use `Valgrind`_ debugging. |
++-----------------+----------+-------------------------------------------------+
+| ``--allhosts`` | ``-a`` | Execute on all hosts in ``ceph.conf``. |
+| | | Otherwise, it only executes on ``localhost``. |
++-----------------+----------+-------------------------------------------------+
+| ``--restart`` | ``N/A`` | Automatically restart daemon if it core dumps. |
++-----------------+----------+-------------------------------------------------+
+| ``--norestart`` | ``N/A`` | Don't restart a daemon if it core dumps. |
++-----------------+----------+-------------------------------------------------+
+| ``--conf`` | ``-c`` | Use an alternate configuration file. |
++-----------------+----------+-------------------------------------------------+
+
+The ``ceph`` service commands include:
+
++------------------+------------------------------------------------------------+
+| Command | Description |
++==================+============================================================+
+| ``start`` | Start the daemon(s). |
++------------------+------------------------------------------------------------+
+| ``stop`` | Stop the daemon(s). |
++------------------+------------------------------------------------------------+
+| ``forcestop`` | Force the daemon(s) to stop. Same as ``kill -9`` |
++------------------+------------------------------------------------------------+
+| ``killall`` | Kill all daemons of a particular type. |
++------------------+------------------------------------------------------------+
+| ``cleanlogs`` | Cleans out the log directory. |
++------------------+------------------------------------------------------------+
+| ``cleanalllogs`` | Cleans out **everything** in the log directory. |
++------------------+------------------------------------------------------------+
+
+For subsystem operations, the ``ceph`` service can target specific daemon types by
+adding a particular daemon type for the ``[daemons]`` option. Daemon types include:
+
+- ``mon``
+- ``osd``
+- ``mds``
+
+The ``ceph`` service's ``[daemons]`` setting may also target a specific instance::
+
+ sudo /etc/init.d/ceph -a start osd.0
+
+Where ``osd.0`` is the first OSD in the cluster.
+
+
+Starting a Cluster
+==================
+
+To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
+The usage may differ based upon your Linux distribution. For example, for most
+newer Debian/Ubuntu distributions, you may use the following syntax::
+
+ sudo service ceph start [options] [start|restart] [daemonType|daemonID]
+
+For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
+
+ sudo /etc/init.d/ceph [options] [start|restart] [daemonType|daemonID]
+
+The following examples illustrate a typical use case::
+
+ sudo service ceph -a start
+ sudo /etc/init.d/ceph -a start
+
+Once you execute with ``-a``, Ceph should begin operating. You may also specify
+a particular daemon instance to constrain the command to a single instance. For
+example::
+
+ sudo /etc/init.d/ceph start osd.0
+
+
+Stopping a Cluster
+==================
+
+To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
+The usage may differ based upon your Linux distribution. For example, for most
+newer Debian/Ubuntu distributions, you may use the following syntax::
+
+ sudo service ceph [options] stop [daemonType|daemonID]
+
+For example::
+
+ sudo service ceph -a stop
+
+For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
+
+ sudo /etc/init.d/ceph -a stop
+
+Ceph should shut down the operating processes.
+
+
+.. _Valgrind: http://www.valgrind.org/
\ No newline at end of file
--- /dev/null
+==========================
+ Placement Group Concepts
+==========================
+
+When you execute commands like ``ceph -w``, ``ceph osd dump``, and other
+commands related to placement groups, Ceph may return values using some
+of the following terms:
+
+*Peering*
+ The process of bringing all of the OSDs that store
+ a Placement Group (PG) into agreement about the state
+ of all of the objects (and their metadata) in that PG.
+ Note that agreeing on the state does not mean that
+ they all have the latest contents.
+
+*Acting Set*
+ The ordered list of OSDs who are (or were as of some epoch)
+ responsible for a particular placement group.
+
+*Up Set*
+ The ordered list of OSDs responsible for a particular placement
+ group for a particular epoch according to CRUSH. Normally this
+ is the same as the *Acting Set*, except when the *Acting Set* has
+ been explicitly overridden via ``pg_temp`` in the OSD Map.
+
+*Current Interval* or *Past Interval*
+ A sequence of OSD map epochs during which the *Acting Set* and *Up
+ Set* for particular placement group do not change.
+
+*Primary*
+ The member (and by convention first) of the *Acting Set*,
+ that is responsible for coordinating peering, and is
+ the only OSD that will accept client-initiated
+ writes to objects in a placement group.
+
+*Replica*
+ A non-primary OSD in the *Acting Set* for a placement group
+ (and who has been recognized as such and *activated* by the primary).
+
+*Stray*
+ An OSD that is not a member of the current *Acting Set*, but
+ has not yet been told that it can delete its copies of a
+ particular placement group.
+
+*Recovery*
+ Ensuring that copies of all of the objects in a placement group
+ are on all of the OSDs in the *Acting Set*. Once *Peering* has
+ been performed, the *Primary* can start accepting write operations,
+ and *Recovery* can proceed in the background.
+
+*PG Info*
+ Basic metadata about the placement group's creation epoch, the version
+ for the most recent write to the placement group, *last epoch started*,
+ *last epoch clean*, and the beginning of the *current interval*. Any
+ inter-OSD communication about placement groups includes the *PG Info*,
+ such that any OSD that knows a placement group exists (or once existed)
+ also has a lower bound on *last epoch clean* or *last epoch started*.
+
+*PG Log*
+ A list of recent updates made to objects in a placement group.
+ Note that these logs can be truncated after all OSDs
+ in the *Acting Set* have acknowledged up to a certain
+ point.
+
+*Missing Set*
+ Each OSD notes update log entries and if they imply updates to
+ the contents of an object, adds that object to a list of needed
+ updates. This list is called the *Missing Set* for that ``<OSD,PG>``.
+
+*Authoritative History*
+ A complete, and fully ordered set of operations that, if
+ performed, would bring an OSD's copy of a placement group
+ up to date.
+
+*Epoch*
+ A (monotonically increasing) OSD map version number.
+
+*Last Epoch Start*
+ The last epoch at which all nodes in the *Acting Set*
+ for a particular placement group agreed on an
+ *Authoritative History*. At this point, *Peering* is
+ deemed to have been successful.
+
+*up_thru*
+ Before a *Primary* can successfully complete the *Peering* process,
+ it must inform a monitor that is alive through the current
+ osd map *Epoch* by having the monitor set its *up_thru* in the osd
+ map. This helps *Peering* ignore previous *Acting Sets* for which
+ *Peering* never completed after certain sequences of failures, such as
+ the second interval below:
+
+ - *acting set* = [A,B]
+ - *acting set* = [A]
+ - *acting set* = [] very shortly after (e.g., simultaneous failure, but staggered detection)
+ - *acting set* = [B] (B restarts, A does not)
+
+*Last Epoch Clean*
+ The last *Epoch* at which all nodes in the *Acting set*
+ for a particular placement group were completely
+ up to date (both placement group logs and object contents).
+ At this point, *recovery* is deemed to have been
+ completed.
--- /dev/null
+========================
+ Placement Group States
+========================
+
+When checking a cluster's status (e.g., running ``ceph -w`` or ``ceph -s``),
+Ceph will report on the status of the placement groups. A placement group has
+one or more states. The optimum state for placement groups in the placement group
+map is ``active + clean``.
+
+*Creating*
+ Ceph is still creating the placement group.
+
+*Active*
+ Ceph will process requests to the placement group.
+
+*Clean*
+ Ceph replicated all objects in the placement group the correct number of times.
+
+*Down*
+ A replica with necessary data is down, so the placement group is offline.
+
+*Replay*
+ The placement group is waiting for clients to replay operations after an OSD crashed.
+
+*Splitting*
+ Ceph is splitting the placement group into multiple placement groups. (functional?)
+
+*Scrubbing*
+ Ceph is checking the placement group for inconsistencies.
+
+*Degraded*
+ Ceph has not replicated some objects in the placement group the correct number of times yet.
+
+*Inconsistent*
+ Ceph detects inconsistencies in one or more replicas of an object in the placement group
+ (e.g. objects are the wrong size, objects are missing from one replica *after* recovery finished, etc.).
+
+*Peering*
+ The placement group is undergoing the peering process
+
+*Repair*
+ Ceph is checking the placement group and repairing any inconsistencies it finds (if possible).
+
+*Recovering*
+ Ceph is migrating/synchronizing objects and their replicas.
+
+*Backfill*
+ Ceph is scanning and synchronizing the entire contents of a placement group
+ instead of inferring what contents need to be synchronized from the logs of
+ recent operations. *Backfill* is a special case of recovery.
+
+*Wait-backfill*
+ The placement group is waiting in line to start backfill.
+
+*Incomplete*
+ Ceph detects that a placement group is missing a necessary period of history
+ from its log. If you see this state, report a bug, and try to start any
+ failed OSDs that may contain the needed information.
+
+*Stale*
+ The placement group is in an unknown state - the monitors have not received
+ an update for it since the placement group mapping changed.
+
+*Remapped*
+ The placement group is temporarily mapped to a different set of OSDs from what
+ CRUSH specified.
--- /dev/null
+==================
+ Placement Groups
+==================
+
+A Placement Group (PG) aggregates a series of objects into a group, and maps the
+group to a series of OSDs. Tracking object placement and object metadata on a
+per-object basis is computationally expensive--i.e., a system with millions of
+objects cannot realistically track placement on a per-object basis. Placement
+groups address this barrier to performance and scalability. Additionally,
+placement groups reduce the number of processes and the amount of per-object
+metadata Ceph must track when storing and retrieving data.
+
+.. ditaa::
+ /-----\ /-----\ /-----\ /-----\ /-----\
+ | obj | | obj | | obj | | obj | | obj |
+ \-----/ \-----/ \-----/ \-----/ \-----/
+ | | | | |
+ +--------+--------+ +---+----+
+ | |
+ v v
+ +-----------------------+ +-----------------------+
+ | Placement Group #1 | | Placement Group #2 |
+ | | | |
+ +-----------------------+ +-----------------------+
+ | |
+ | +-----------------------+---+
+ +------+------+-------------+ |
+ | | | |
+ v v v v
+ /----------\ /----------\ /----------\ /----------\
+ | | | | | | | |
+ | OSD #1 | | OSD #2 | | OSD #3 | | OSD #4 |
+ | | | | | | | |
+ \----------/ \----------/ \----------/ \----------/
+
+Each placement group requires some amount of system resources:
+
+- **Directly**: Each PG requires some amount of memory and CPU.
+- **Indirectly**: The total number of PGs increases the peering count.
+
+Increasing the number of placement groups reduces the variance in per-OSD load
+across your cluster. We recommend approximately 50-100 placement groups per OSD
+to balance out memory and CPU requirements and per-OSD load. For a single pool
+of objects, you can use the following formula::
+
+ (OSDs * 100)
+ Total PGs = ------------
+ Replicas
+
+When using multiple data pools for storing objects, you need to ensure that you
+balance the number of placement groups per pool with the number of placement
+groups per OSD so that you arrive at a reasonable total number of placement
+groups that provides reasonably low variance per OSD without taxing system
+resources or making the peering process too slow.
+
+.. _setting the number of placement groups:
+
+Set the Number of Placement Groups
+==================================
+
+To set the number of placement groups in a pool, you must specify the
+number of placement groups at the time you create the pool.
+
+See `Create a Pool`_ for details.
+
+.. _Create a Pool: ../pools#createpool
+
+Get the Number of Placement Groups
+==================================
+
+To get the number of placement groups in a pool, execute the following::
+
+ ceph osd pool get {pool-name} pg_num
+
+
+Get a Cluster's PG Statistics
+=============================
+
+To get the statistics for the placement groups in your cluster, execute the following::
+
+ ceph pg dump [--format {format}]
+
+Valid formats are ``plain`` (default) and ``json``.
+
+
+Get Statistics for Stuck PGs
+============================
+
+To get the statistics for all placement groups stuck in a specified state,
+execute the following::
+
+ ceph pg dump_stuck inactive|unclean|stale [--format <format>] [-t|--threshold <seconds>]
+
+**Inactive** Placement groups cannot process reads or writes because they are waiting for an OSD
+with the most up-to-date data to come up and in.
+
+**Unclean** Placement groups contain objects that are not replicated the desired number
+of times. They should be recovering.
+
+**Stale** Placement groups are in an unknown state - the OSDs that host them have not
+reported to the monitor cluster in a while (configured by ``mon_osd_report_timeout``).
+
+Valid formats are ``plain`` (default) and ``json``. The threshold defines the minimum number
+of seconds the placement group is stuck before including it in the returned statistics
+(default 300 seconds).
+
+
+Get a PG Map
+============
+
+To get the placement group map for a particular placement group, execute the following::
+
+ ceph pg map {pg-id}
+
+For example::
+
+ ceph pg map 1.6c
+
+Ceph will return the placement group map, the placement group, and the OSD status::
+
+ osdmap e13 pg 1.6c (1.6c) -> up [1,0] acting [1,0]
+
+
+Get a PG's Statistics
+=====================
+
+To retrieve statistics for a particular placement group, execute the following::
+
+ ceph pg {pg-id} query
+
+
+Scrub a Placement Group
+=======================
+
+To scrub a placement group, execute the following::
+
+ ceph pg scrub {pg-id}
+
+Ceph checks the primary and any replica nodes, generates a catalog of all objects
+in the placement group and compares them to ensure that no objects are missing
+or mismatched, and their contents are consistent. Assuming the replicas all
+match, a final semantic sweep ensures that all of the snapshot-related object
+metadata is consistent. Errors are reported via logs.
+
+
+Revert Lost
+===========
+
+If the cluster has lost one or more objects, and you have decided to
+abandon the search for the lost data, you must mark the unfound objects
+as ``lost``.
+
+If all possible locations have been queried and objects are still
+lost, you may have to give up on the lost objects. This is
+possible given unusual combinations of failures that allow the cluster
+to learn about writes that were performed before the writes themselves
+are recovered.
+
+Currently the only supported option is "revert", which will either roll back to
+a previous version of the object or (if it was a new object) forget about it
+entirely. To mark the "unfound" objects as "lost", execute the following::
+
+ ceph pg {pg-id} mark_unfound_lost revert
+
+.. important:: Use this feature with caution, because it may confuse
+ applications that expect the object(s) to exist.
+
+
+.. toctree::
+ :hidden:
+
+ pg-states
+ pg-concepts
--- /dev/null
+=======
+ Pools
+=======
+
+When you first deploy a cluster without creating a pool, Ceph uses the default
+pools for storing data. A pool differs from CRUSH's location-based buckets in
+that a pool doesn't have a single physical location, and a pool provides you
+with some additional functionality, including:
+
+- **Replicas**: You can set the desired number of copies/replicas of an object.
+ A typical configuration stores an object and one additional copy
+ (i.e., ``size = 2``), but you can determine the number of copies/replicas.
+
+- **Placement Groups**: You can set the number of placement groups for the pool.
+ A typical configuration uses approximately 100 placement groups per OSD to
+ provide optimal balancing without using up too many computing resources. When
+ setting up multiple pools, be careful to ensure you set a reasonable number of
+ placement groups for both the pool and the cluster as a whole.
+
+- **CRUSH Rules**: When you store data in a pool, a CRUSH ruleset mapped to the
+ pool enables CRUSH to identify a rule for the placement of the primary object
+ and object replicas in your cluster. You can create a custom CRUSH rule for your
+ pool.
+
+- **Snapshots**: When you create snapshots with ``ceph osd pool mksnap``,
+ you effectively take a snapshot of a particular pool.
+
+- **Set Ownership**: You can set a user ID as the owner of a pool.
+
+To organize data into pools, you can list, create, and remove pools.
+You can also view the utilization statistics for each pool.
+
+
+List Pools
+==========
+
+To list your cluster's pools, execute::
+
+ ceph osd lspools
+
+The default pools include:
+
+- ``data``
+- ``metadata``
+- ``rbd``
+
+
+.. _createpool:
+
+Create a Pool
+=============
+
+To create a pool, execute::
+
+ ceph osd pool create {pool-name} {pg-num} [{pgp-num}]
+
+Where:
+
+``{pool-name}``
+
+:Description: The name of the pool. It must be unique.
+:Type: String
+:Required: Yes
+
+``{pg-num}``
+
+:Description: The total number of placement groups for the pool
+:Type: Integer
+:Required: No
+
+``{pgp-num}``
+
+:Description: The total number of placement groups for placement purposes.
+:Type: Integer
+:Required: No
+
+When you create a pool, you should consider setting the number of
+placement groups.
+
+.. important:: You cannot change the number of placement groups in a pool
+ after you create it.
+
+See `Placement Groups`_ for details on calculating an appropriate number of
+placement groups for your pool.
+
+.. _Placement Groups: ../placement-groups
+
+
+Delete a Pool
+=============
+
+To delete a pool, execute::
+
+ ceph osd pool delete {pool-name}
+
+
+If you created your own rulesets and rules for a pool you created, you should
+consider removing them when you no longer need your pool. If you created users
+with permissions strictly for a pool that no longer exists, you should consider
+deleting those users too.
+
+
+Rename a Pool
+=============
+
+To rename a pool, execute::
+
+ ceph osd pool rename {current-pool-name} {new-pool-name}
+
+If you rename a pool and you have per-pool capabilities for an authenticated
+user, you must update the user's capabilities (i.e., caps) with the new pool
+name.
+
+.. note: Version ``0.48`` Argonaut and above.
+
+Show Pool Statistics
+====================
+
+To show a pool's utilization statistics, execute::
+
+ rados df
+
+
+Make a Snapshot of a Pool
+=========================
+
+To make a snapshot of a pool, execute::
+
+ ceph osd pool mksnap {pool-name} {snap-name}
+
+.. note: Version ``0.48`` Argonaut and above.
+
+
+Remove a Snapshot of a Pool
+===========================
+
+To remove a snapshot of a pool, execute::
+
+ ceph osd pool rmsnap {pool-name} {snap-name}
+
+.. note: Version ``0.48`` Argonaut and above.
+
+.. _setpoolvalues:
+
+Set Pool Values
+===============
+
+To set a value to a pool, execute the following::
+
+ ceph osd pool set {pool-name} {key} {value}
+
+You may set values for the following keys:
+
+``size``
+
+:Description: Sets the number of replicas for objects in the pool. See `Set the Number of Object Replicas`_ for further details.
+:Type: Integer
+
+``min_size``
+
+:Description: Sets the minimum number of replicas required for io. See `Set the Number of Object Replicas`_ for further details
+:Type: Integer
+
+.. note: Version ``0.54`` and above
+
+``crash_replay_interval``
+
+:Description: The number of seconds to allow clients to replay acknowledged, but uncommitted requests.
+:Type: Integer
+
+
+``pgp_num``
+
+:Description: The effective number of placement groups to use when calculating data placement.
+:Type: Integer
+:Valid Range: Equal to or less than ``pg_num``.
+
+
+``crush_ruleset``
+
+:Description: The ruleset to use for mapping object placement in the cluster.
+:Type: Integer
+
+
+.. note: Version ``0.48`` Argonaut and above.
+
+
+Get Pool Values
+===============
+
+To get a value from a pool, execute the following::
+
+ ceph osd pool get {pool-name} {key}
+
+
+``pg_num``
+
+:Description: The number of placement groups for the pool.
+:Type: Integer
+
+
+``pgp_num``
+
+:Description: The effective number of placement groups to use when calculating data placement.
+:Type: Integer
+:Valid Range: Equal to or less than ``pg_num``.
+
+
+Set the Number of Object Replicas
+=================================
+
+To set the number of object replicas, execute the following::
+
+ ceph osd pool set {poolname} size {num-replicas}
+
+.. important: The ``{num-replicas}`` includes the object itself.
+ If you want the object and two copies of the object for a total of
+ three instances of the object, specify ``3``.
+
+For example::
+
+ ceph osd pool set data size 3
+
+You may execute this command for each pool.
+
+Note, however, that pool size is more of a best-effort setting: an object
+might accept ios in degraded mode with fewer than size replicas. To
+set a minimum number of required replicas for io, you should use the
+min_size setting.
+
+For example::
+
+ ceph osd pool set data min_size 2
+
+This ensures that no object in the data pool will receive io with fewer than
+min_size replicas.
+
+
+Get the Number of Object Replicas
+=================================
+
+To get the number of object replicas, execute the following::
+
+ ceph osd dump | grep 'rep size'
+
+Ceph will list the pools, with the ``rep size`` attribute highlighted.
+By default, Ceph creates two replicas of an object (two copies).
--- /dev/null
+==================================
+ Recovering from Monitor Failures
+==================================
+
+In production clusters, we recommend running the cluster with a minimum
+of three monitors. The failure of a single monitor should not take down
+the entire monitor cluster, provided a majority of the monitors remain
+available. If the majority of nodes are available, the remaining nodes
+will be able to form a quorum.
+
+When you check your cluster's health, you may notice that a monitor
+has failed. For example::
+
+ ceph health
+ HEALTH_WARN 1 mons down, quorum 0,2
+
+For additional detail, you may check the cluster status::
+
+ ceph status
+ HEALTH_WARN 1 mons down, quorum 0,2
+ mon.b (rank 1) addr 192.168.106.220:6790/0 is down (out of quorum)
+
+In most cases, you can simply restart the affected node.
+For example::
+
+ service ceph -a restart {failed-mon}
+
+If there are not enough monitors to form a quorum, the ``ceph``
+command will block trying to reach the cluster. In this situation,
+you need to get enough ``ceph-mon`` daemons running to form a quorum
+before doing anything else with the cluster.
\ No newline at end of file
--- /dev/null
+==============================
+ Recovering from OSD Failures
+==============================
+
+Single OSD Failure
+==================
+
+When a ``ceph-osd`` process dies, the monitor will learn about the failure
+from surviving ``ceph-osd`` daemons and report it via the ``ceph health``
+command::
+
+ ceph health
+ HEALTH_WARN 1/3 in osds are down
+
+Specifically, you will get a warning whenever there are ``ceph-osd``
+processes that are marked ``in`` and ``down``. You can identify which
+``ceph-osds`` are ``down`` with::
+
+ ceph health detail
+ HEALTH_WARN 1/3 in osds are down
+ osd.0 is down since epoch 23, last address 192.168.106.220:6800/11080
+
+Under normal circumstances, simply restarting the ``ceph-osd`` daemon will
+allow it to rejoin the cluster and recover. If there is a disk
+failure or other fault preventing ``ceph-osd`` from functioning or
+restarting, an error message should be present in its log file in
+``/var/log/ceph``.
+
+If the daemon stopped because of a heartbeat failure, the underlying
+kernel file system may be unresponsive. Check ``dmesg`` output for disk
+or other kernel errors.
+
+If the problem is a software error (failed assertion or other
+unexpected error), it should be reported to the :ref:`mailing list
+<mailing-list>`.
+
+
+The Cluster Has No Free Disk Space
+==================================
+
+If the cluster fills up, the monitor will prevent new data from being
+written. The system puts ``ceph-osds`` in two categories: ``nearfull``
+and ``full``, with configurable thresholds for each (80% and 90% by
+default). In both cases, full ``ceph-osds`` will be reported by ``ceph health``::
+
+ ceph health
+ HEALTH_WARN 1 nearfull osds
+ osd.2 is near full at 85%
+
+Or::
+
+ ceph health
+ HEALTH_ERR 1 nearfull osds, 1 full osds
+ osd.2 is near full at 85%
+ osd.3 is full at 97%
+
+The best way to deal with a full cluster is to add new ``ceph-osds``,
+allowing the cluster to redistribute data to the newly available
+storage.
+
+
+Homeless Placement Groups
+=========================
+
+It is possible for all OSDs that had copies of a given placement groups to fail.
+If that's the case, that subset of the object store is unavailable, and the
+monitor will receive no status updates for those placement groups. To detect
+this situation, the monitor marks any placement group whose primary OSD has
+failed as ``stale``. For example::
+
+ ceph health
+ HEALTH_WARN 24 pgs stale; 3/300 in osds are down
+
+You can identify which placement groups are ``stale``, and what the last OSDs to
+store them were, with::
+
+ ceph health detail
+ HEALTH_WARN 24 pgs stale; 3/300 in osds are down
+ ...
+ pg 2.5 is stuck stale+active+remapped, last acting [2,0]
+ ...
+ osd.10 is down since epoch 23, last address 192.168.106.220:6800/11080
+ osd.11 is down since epoch 13, last address 192.168.106.220:6803/11539
+ osd.12 is down since epoch 24, last address 192.168.106.220:6806/11861
+
+If we want to get placement group 2.5 back online, for example, this tells us that
+it was last managed by ``osd.0`` and ``osd.2``. Restarting those ``ceph-osd``
+daemons will allow the cluster to recover that placement group (and, presumably,
+many others).
+
+
+Stuck Placement Groups
+======================
+
+It is normal for placement groups to enter states like "degraded" or "peering"
+following a failure. Normally these states indicate the normal progression
+through the failure recovery process. However, if a placement group stays in one
+of these states for a long time this may be an indication of a larger problem.
+For this reason, the monitor will warn when placement groups get "stuck" in a
+non-optimal state. Specifically, we check for:
+
+* ``inactive`` - The placement group has not been ``active`` for too long
+ (i.e., it hasn't been able to service read/write requests).
+
+* ``unclean`` - The placement group has not been ``clean`` for too long
+ (i.e., it hasn't been able to completely recover from a previous failure).
+
+* ``stale`` - The placement group status has not been updated by a ``ceph-osd``,
+ indicating that all nodes storing this placement group may be ``down``.
+
+You can explicitly list stuck placement groups with one of::
+
+ ceph pg dump_stuck stale
+ ceph pg dump_stuck inactive
+ ceph pg dump_stuck unclean
+
+For stuck ``stale`` placement groups, it is normally a matter of getting the
+right ``ceph-osd`` daemons running again. For stuck ``inactive`` placement
+groups, it is usually a peering problem (see :ref:`failures-osd-peering`). For
+stuck ``unclean`` placement groups, there is usually something preventing
+recovery from completing, like unfound objects (see
+:ref:`failures-osd-unfound`).
+
+
+.. _failures-osd-peering:
+
+Placement Group Down - Peering Failure
+======================================
+
+In certain cases, the ``ceph-osd`` `Peering` process can run into
+problems, preventing a PG from becoming active and usable. For
+example, ``ceph health`` might report::
+
+ ceph health detail
+ HEALTH_ERR 7 pgs degraded; 12 pgs down; 12 pgs peering; 1 pgs recovering; 6 pgs stuck unclean; 114/3300 degraded (3.455%); 1/3 in osds are down
+ ...
+ pg 0.5 is down+peering
+ pg 1.4 is down+peering
+ ...
+ osd.1 is down since epoch 69, last address 192.168.106.220:6801/8651
+
+We can query the cluster to determine exactly why the PG is marked ``down`` with::
+
+ ceph pg 0.5 query
+
+.. code-block:: javascript
+
+ { "state": "down+peering",
+ ...
+ "recovery_state": [
+ { "name": "Started\/Primary\/Peering\/GetInfo",
+ "enter_time": "2012-03-06 14:40:16.169679",
+ "requested_info_from": []},
+ { "name": "Started\/Primary\/Peering",
+ "enter_time": "2012-03-06 14:40:16.169659",
+ "probing_osds": [
+ 0,
+ 1],
+ "blocked": "peering is blocked due to down osds",
+ "down_osds_we_would_probe": [
+ 1],
+ "peering_blocked_by": [
+ { "osd": 1,
+ "current_lost_at": 0,
+ "comment": "starting or marking this osd lost may let us proceed"}]},
+ { "name": "Started",
+ "enter_time": "2012-03-06 14:40:16.169513"}
+ ]
+ }
+
+The ``recovery_state`` section tells us that peering is blocked due to
+down ``ceph-osd`` daemons, specifically ``osd.1``. In this case, we can start that ``ceph-osd``
+and things will recover.
+
+Alternatively, if there is a catastrophic failure of ``osd.1`` (e.g., disk
+failure), we can tell the cluster that it is ``lost`` and to cope as
+best it can.
+
+.. important:: This is dangerous in that the cluster cannot
+ guarantee that the other copies of the data are consistent
+ and up to date.
+
+To instruct Ceph to continue anyway::
+
+ ceph osd lost 1
+
+Recovery will proceed.
+
+
+.. _failures-osd-unfound:
+
+Unfound Objects
+===============
+
+Under certain combinations of failures Ceph may complain about
+``unfound`` objects::
+
+ ceph health detail
+ HEALTH_WARN 1 pgs degraded; 78/3778 unfound (2.065%)
+ pg 2.4 is active+degraded, 78 unfound
+
+This means that the storage cluster knows that some objects (or newer
+copies of existing objects) exist, but it hasn't found copies of them.
+One example of how this might come about for a PG whose data is on ceph-osds
+1 and 2:
+
+* 1 goes down
+* 2 handles some writes, alone
+* 1 comes up
+* 1 and 2 repeer, and the objects missing on 1 are queued for recovery.
+* Before the new objects are copied, 2 goes down.
+
+Now 1 knows that these objects exist, but there is no live ``ceph-osd`` who
+has a copy. In this case, IO to those objects will block, and the
+cluster will hope that the failed node comes back soon; this is
+assumed to be preferable to returning an IO error to the user.
+
+First, you can identify which objects are unfound with::
+
+ ceph pg 2.4 list_missing [starting offset, in json]
+
+.. code-block:: javascript
+
+ { "offset": { "oid": "",
+ "key": "",
+ "snapid": 0,
+ "hash": 0,
+ "max": 0},
+ "num_missing": 0,
+ "num_unfound": 0,
+ "objects": [
+ { "oid": "object 1",
+ "key": "",
+ "hash": 0,
+ "max": 0 },
+ ...
+ ],
+ "more": 0}
+
+If there are too many objects to list in a single result, the ``more``
+field will be true and you can query for more. (Eventually the
+command line tool will hide this from you, but not yet.)
+
+Second, you can identify which OSDs have been probed or might contain
+data::
+
+ ceph pg 2.4 query
+
+.. code-block:: javascript
+
+ "recovery_state": [
+ { "name": "Started\/Primary\/Active",
+ "enter_time": "2012-03-06 15:15:46.713212",
+ "might_have_unfound": [
+ { "osd": 1,
+ "status": "osd is down"}]},
+
+In this case, for example, the cluster knows that ``osd.1`` might have
+data, but it is ``down``. The full range of possible states includes::
+
+ * already probed
+ * querying
+ * osd is down
+ * not queried (yet)
+
+Sometimes it simply takes some time for the cluster to query possible
+locations.
+
+It is possible that there are other locations where the object can
+exist that are not listed. For example, if a ceph-osd is stopped and
+taken out of the cluster, the cluster fully recovers, and due to some
+future set of failures ends up with an unfound object, it won't
+consider the long-departed ceph-osd as a potential location. (This
+scenario, however, is unlikely.)
+
+If all possible locations have been queried and objects are still
+lost, you may have to give up on the lost objects. This, again, is
+possible given unusual combinations of failures that allow the cluster
+to learn about writes that were performed before the writes themselves
+are recovered. To mark the "unfound" objects as "lost"::
+
+ ceph pg 2.4 mark_unfound_lost revert
+
+The final argument specifies how the cluster should deal with
+lost objects. Currently the only supported option is "revert", which
+will either roll back to a previous version of the object or (if it
+was a new object) forget about it entirely. Use this with caution, as
+it may confuse applications that expected the object to exist.
+
+
+
+Slow or Unresponsive OSD
+========================
+
+If, for some reason, a ``ceph-osd`` is slow to respond to a request, it will
+generate log messages complaining about requests that are taking too
+long. The warning threshold defaults to 30 seconds, and is configurable
+via the ``osd op complaint time`` option. When this happens, the cluster
+log will receive messages like::
+
+ osd.0 192.168.106.220:6800/18813 312 : [WRN] old request osd_op(client.5099.0:790 fatty_26485_object789 [write 0~4096] 2.5e54f643) v4 received at 2012-03-06 15:42:56.054801 currently waiting for sub ops
+
+Possible causes include:
+
+ * bad disk (check ``dmesg`` output)
+ * kernel file system bug (check ``dmesg`` output)
+ * overloaded cluster (check system load, iostat, etc.)
+ * ceph-osd bug
+
+
+Flapping OSDs
+=============
+
+If something is causing OSDs to "flap" (repeatedly getting marked ``down`` and then
+``up`` again), you can force the monitors to stop changing their state with::
+
+ ceph osd set noup # prevent osds from getting marked up
+ ceph osd set nodown # prevent osds from getting marked down
+
+These flags are recorded in the osdmap structure::
+
+ ceph osd dump | grep flags
+ flags no-up,no-down
+
+You can clear the flags with::
+
+ ceph osd unset noup
+ ceph osd unset nodown
+
+Two other flags are supported, ``noin`` and ``noout``, which prevent
+booting OSDs from being marked ``in`` (allocated data) or prevent down
+ceph-osds from eventually being marked ``out`` (regardless of what the
+current value for ``mon osd down out interval`` is).
+
+Note that ``noup``, ``nodown``, and ``noout`` are temporary in the
+sense that once the flags are cleared, the action they were blocking
+should occur shortly after. The ``noin`` flag, on the other hand,
+prevents ceph-osds from being marked ``in`` on boot, and any daemons that
+started while the flag was set will remain that way.
--- /dev/null
+=================
+ Troubleshooting
+=================
+
+When monitoring your cluster, you may receive health warnings and you may also
+notice that not all of your daemons are running properly. The following
+sections will help you identify and resolve daemon operations issues.
+
+.. toctree::
+
+ OSD Failures <troubleshooting-osd>
+ MON Failures <troubleshooting-mon>
\ No newline at end of file