*
* - __CRUSH_RULE_NOOP__ do nothing.
* - __CRUSH_RULE_TAKE__ select the __arg1__ item
+ * - __CRUSH_RULE_EMIT__ append the selection to the results and clear
+ * the selection
+ *
* - __CRUSH_RULE_CHOOSE_FIRSTN__ and __CRUSH_RULE_CHOOSE_INDEP__
* recursively explore each bucket currently selected, looking for
* __arg1__ items of type __arg2__ and select them.
* - __CRUSH_RULE_CHOOSELEAF_FIRSTN__ and __CRUSH_RULE_CHOOSELEAF_INDEP__
* recursively explore each bucket currently selected, looking for
* __arg1__ leaves within all the buckets of type __arg2__ and
* select them.
- * - __CRUSH_RULE_EMIT__ append the selection to the results and clear
- * the selection
*
* In all __CHOOSE__ steps, if __arg1__ is zero, the number of items
* to select is determined by the __max_result__ argument of
* crush_do_rule(), i.e. __arg1__ is __max_result__ minus the number of
* items already in the result.
*
+ * - __CRUSH_RULE_SET_CHOOSE_TRIES__ and __CRUSH_RULE_SET_CHOOSELEAF_TRIES__
+ *
+ * The CHOOSE_FIRSTN and CHOOSE_INDEP rule steps look for buckets of
+ * a given type, randomly selecting them. If they are unlucky and
+ * find the same bucket twice, they will try N+1 times (N being the
+ * value of the choose_total_tries tunable). If there is a previous
+ * SET_CHOOSE_TRIES step in the same rule, it will try C times
+ * instead (C being the value of the argument of the
+ * SET_CHOOSE_TRIES step).
+ *
+ * Note: the __choose_total_tries__ tunable defined in crush_map is
+ * the number of retries, not the number of tries. The number of tries
+ * is the number of retries + 1. The SET_CHOOSE_TRIES rule step sets the
+ * number of tries and does not need the + 1. This confusing
+ * difference is inherited from an off-by-one bug from years ago.
+ *
+ * The CHOOSELEAF_FIRSTN and CHOOSELEAF_INDEP rule steps do the same
+ * as CHOOSE_FIRSTN and CHOOSE_INDEP but also recursively explore
+ * each bucket found, looking for a single device. The same device
+ * may be found in two different buckets because the crush map is
+ * not a strict hierarchy, it is a DAG. When such a collision
+ * happens, they will try again. The number of times they try to
+ * find a non-colliding device is:
+ *
+ * - If FIRSTN and there is no previous SET_CHOOSELEAF_TRIES rule
+ * step: try N + 1 times (N being the value of the
+ * __choose_total_tries__ tunable defined in crush_map)
+ *
+ * - If FIRSTN and there is a previous SET_CHOOSELEAF_TRIES rule
+ * step: try P times (P being the value of the argument of the
+ * SET_CHOOSELEAF_TRIES rule step)
+ *
+ * - If INDEP and there is no previous SET_CHOOSELEAF_TRIES rule
+ * step: try 1 time.
+ *
+ * - If INDEP and there is a previous SET_CHOOSELEAF_TRIES rule step: try
+ * P times (P being the value of the argument of the SET_CHOOSELEAF_TRIES
+ * rule step)
+ *
* @param rule the rule in which the step is inserted
* @param pos the zero based step index
- * @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__ or __CRUSH_RULE_EMIT__
+ * @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__, __CRUSH_RULE_SET_CHOOSE_TRIES__, __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ or __CRUSH_RULE_EMIT__
* @param arg1 first argument for __op__
* @param arg2 second argument for __op__
*/
*/
__s32 max_devices;
- /*! choose local retries before re-descent */
+ /*! Backward compatibility tunable. It implements a bad solution
+ * and must always be set to 0 except for backward compatibility
+ * purposes
+ */
__u32 choose_local_tries;
- /*! choose local attempts using a fallback permutation before
- *! re-descent */
+ /*! Backward compatibility tunable. It implements a bad solution
+ * and must always be set to 0 except for backward compatibility
+ * purposes
+ */
__u32 choose_local_fallback_tries;
- /*! choose attempts before giving up */
+ /*! Tunable. The default value used when the SET_CHOOSE_TRIES or
+ * SET_CHOOSELEAF_TRIES steps are omitted in a rule. See the
+ * documentation for crush_rule_set_step() for more
+ * information.
+ */
__u32 choose_total_tries;
- /*! attempt chooseleaf inner descent once for firstn mode; on
- *! reject retry outer descent. Note that this does *not*
- *! apply to a collision: in that case we will retry as we used
- *! to. */
+ /*! Backward compatibility tunable. It should always be set
+ * to 1 except for backward compatibility. Implemented in 2012,
+ * it was generalized in late 2013 and is mostly unused except
+ * in one corner case, which is why it must be set to 1.
+ *
+ * Attempt chooseleaf inner descent once for firstn mode; on
+ * reject retry outer descent. Note that this does *not*
+ * apply to a collision: in that case we will retry as we
+ * used to.
+ */
__u32 chooseleaf_descend_once;
-
- /*! if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
- *! bits. a value of 1 is best for new clusters. for legacy clusters
- *! that want to limit reshuffling, a value of 3 or 4 will make the
- *! mappings line up a bit better with previous mappings. */
+ /*! Backward compatibility tunable. It is a fix for bad
+ * mappings implemented in 2014 at
+ * https://github.com/ceph/ceph/pull/1185. It should always
+ * be set to 1 except for backward compatibility.
+ *
+ * If non-zero, feed r into chooseleaf, bit-shifted right by
+ * (r-1) bits. A value of 1 is best for new clusters. For
+ * legacy clusters that want to limit reshuffling, a value of
+ * 3 or 4 will make the mappings line up a bit better with
+ * previous mappings.
+ */
__u8 chooseleaf_vary_r;
- /*! if true, it makes chooseleaf firstn to return stable results (if
- *! no local retry) so that data migrations would be optimal when some
- *! device fails. */
+ /*! Backward compatibility tunable. It is an improvement that
+ * avoids unnecessary mapping changes, implemented at
+ * https://github.com/ceph/ceph/pull/6572 and explained in
+ * this post: "chooseleaf may cause some unnecessary pg
+ * migrations" in October 2015
+ * https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
+ * It should always be set to 1 except for backward compatibility.
+ */
__u8 chooseleaf_stable;
/*! @cond INTERNAL */
size_t working_size;
#ifndef __KERNEL__
- /*
- * version 0 (original) of straw_calc has various flaws. version 1
- * fixes a few of them.
+ /*! @endcond */
+ /*! Backward compatibility tunable. It is a fix for the straw
+ * scaler values of the straw algorithm (which is deprecated;
+ * straw2 replaces it), implemented at
+ * https://github.com/ceph/ceph/pull/3057. It should always
+ * be set to 1 except for backward compatibility.
+ *
*/
__u8 straw_calc_version;
+ /*! @cond INTERNAL */
/*
* allowed bucket algs is a bitmask, here the bit positions
* are CRUSH_BUCKET_*. note that these are *bits* and