rgw: add some comments to rgw code to help explain functionality

author J. Eric Ivancich <ivancich@redhat.com>

Tue, 30 Apr 2019 18:01:19 +0000 (14:01 -0400)

committer J. Eric Ivancich <ivancich@redhat.com>

Fri, 3 May 2019 19:29:59 +0000 (15:29 -0400)
author J. Eric Ivancich <ivancich@redhat.com>
Tue, 30 Apr 2019 18:01:19 +0000 (14:01 -0400)
committer J. Eric Ivancich <ivancich@redhat.com>
Fri, 3 May 2019 19:29:59 +0000 (15:29 -0400)
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc

index a332737427704c0689dcc92f1bda98b7bb4b9833..ed68d2bc5ca342e445c524646dda5bfffb949246 100644 (file)
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -214,11 +214,13 @@ static void decreasing_str(uint64_t num, string *str)
  }
  
  /*
- * we now hold two different indexes for objects. The first one holds the list of objects in the
- * order that we want them to be listed. The second one only holds the objects instances (for
- * versioned objects), and they're not arranged in any particular order.
- * When listing objects we'll use the first index, when doing operations on the objects themselves
- * we'll use the second index. Note that regular objects only map to the first index anyway
+ * We hold two different indexes for objects. The first one holds the
+ * list of objects in the order that we want them to be listed. The
+ * second one only holds the object instances (for versioned
+ * objects), and they're not arranged in any particular order. When
+ * listing objects we'll use the first index; when doing operations on
+ * the objects themselves we'll use the second index. Note that
+ * regular objects only map to the first index anyway.
   */
  
  static void get_list_index_key(rgw_bucket_dir_entry& entry, string *index_key)
@@ -1385,17 +1387,20 @@ static int convert_plain_entry_to_versioned(cls_method_context_t hctx, cls_rgw_o
  }
  
  /*
- * link an object version to an olh, update the relevant index entries. It will also handle the
- * deletion marker case. We have a few entries that we need to take care of. For object 'foo',
+ * Link an object version to an olh, update the relevant index
+ * entries. It will also handle the deletion marker case. We have a
+ * few entries that we need to take care of. For object 'foo',
   * instance BAR, we'd update the following (not actual encoding):
+ *
   *  - olh data: [BI_BUCKET_OLH_DATA_INDEX]foo
   *  - object instance data: [BI_BUCKET_OBJ_INSTANCE_INDEX]foo,BAR
   *  - object instance list entry: foo,123,BAR
   *
- *  The instance list entry needs to be ordered by newer to older, so we generate an appropriate
- *  number string that follows the name.
- *  The top instance for each object is marked appropriately.
- *  We generate instance entry for deletion markers here, as they are not created prior.
+ *  The instance list entry needs to be ordered by newer to older, so
+ *  we generate an appropriate number string that follows the name.
+ *  The top instance for each object is marked appropriately. We
+ *  generate an instance entry for deletion markers here, as they are
+ *  not created beforehand.
   */
  static int rgw_bucket_link_olh(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
  {
@@ -1433,20 +1438,25 @@ static int rgw_bucket_link_olh(cls_method_context_t hctx, bufferlist *in, buffer
        unmod.tv_nsec = 0;
      }
      if (mtime >= unmod) {
-      return 0; /* no need to set error, we just return 0 and avoid writing to the bi log */
+      return 0; /* no need to set error, we just return 0 and avoid
+                * writing to the bi log */
      }
    }
  
    bool removing;
  
    /*
-   * Special handling for null instance object / delete-marker. For these objects we're going to
-   * have separate instances for a data object vs. delete-marker to avoid collisions. We now check
-   * if we got to overwrite a previous entry, and in that case we'll remove its list entry.
+   * Special handling for null instance object / delete-marker. For
+   * these objects we're going to have separate instances for a data
+   * object vs. delete-marker to avoid collisions. We now check if we
+   * got to overwrite a previous entry, and in that case we'll remove
+   * its list entry.
     */
    if (op.key.instance.empty()) {
      BIVerObjEntry other_obj(hctx, op.key);
-    ret = other_obj.init(!op.delete_marker); /* try reading the other null versioned entry */
+    ret = other_obj.init(!op.delete_marker); /* try reading the other
+                                             * null versioned
+                                             * entry */
      existed = (ret >= 0 && !other_obj.is_delete_marker());
      if (ret >= 0 && other_obj.is_delete_marker() != op.delete_marker) {
        ret = other_obj.unlink_list_entry();
@@ -1687,7 +1697,8 @@ static int rgw_bucket_unlink_instance(cls_method_context_t hctx, bufferlist *in,
    if (!obj.is_delete_marker()) {
      olh.update_log(CLS_RGW_OLH_OP_REMOVE_INSTANCE, op.op_tag, op.key, false);
    } else {
-    /* this is a delete marker, it's our responsibility to remove its instance entry */
+    /* this is a delete marker, it's our responsibility to remove its
+     * instance entry */
      ret = obj.unlink();
      if (ret < 0) {
        return ret;
@@ -1715,7 +1726,8 @@ static int rgw_bucket_unlink_instance(cls_method_context_t hctx, bufferlist *in,
      rgw_bucket_entry_ver ver;
      ver.epoch = (op.olh_epoch ? op.olh_epoch : olh.get_epoch());
  
-    real_time mtime = obj.mtime(); /* mtime has no real meaning in instance removal context */
+    real_time mtime = obj.mtime(); /* mtime has no real meaning in
+                                   * instance removal context */
      ret = log_index_operation(hctx, op.key, CLS_RGW_OP_UNLINK_INSTANCE, op.op_tag,
                                mtime, ver,
                                CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker,
@@ -3157,10 +3169,27 @@ int rgw_usage_log_clear(cls_method_context_t hctx, bufferlist *in, bufferlist *o
  
    return ret;
  }
+
  /*
- * We hold the garbage collection chain data under two different indexes: the first 'name' index
- * keeps them under a unique tag that represents the chains, and a second 'time' index keeps
- * them by their expiration timestamp
+ * We hold the garbage collection chain data under two different
+ * indexes: the first 'name' index keeps them under a unique tag that
+ * represents the chains, and a second 'time' index keeps them by
+ * their expiration timestamp. Each is prefixed differently (see
+ * gc_index_prefixes below).
+ *
+ * Since key-value data is listed in lexical order by keys, generally
+ * the name entries are retrieved first and then the time entries.
+ * When listing the entries via `gc_iterate_entries` one parameter is
+ * a marker, and if we were to pass "1_" (i.e.,
+ * gc_index_prefixes[GC_OBJ_TIME_INDEX]), the listing would skip over
+ * the 'name' entries and begin with the 'time' entries.
+ *
+ * Furthermore, the times are converted to strings such that lexical
+ * order correlates with chronological order, so the entries are
+ * returned chronologically from the earliest expiring to the latest
+ * expiring. This allows starting at "1_" and retrieving successive
+ * chunks of entries; as long as they are prior to the current time,
+ * they're expired and processing can continue.
   */
  #define GC_OBJ_NAME_INDEX 0
  #define GC_OBJ_TIME_INDEX 1
diff --git a/src/rgw/rgw_zone.h b/src/rgw/rgw_zone.h

index 6f1bd64378a87cd121133c6006e2652eac585c83..19603d6caf1910a2f13d3e0ccad26930b24c1574 100644 (file)
--- a/src/rgw/rgw_zone.h
+++ b/src/rgw/rgw_zone.h
@@ -975,9 +975,30 @@ struct RGWPeriodLatestEpochInfo {
  };
  WRITE_CLASS_ENCODER(RGWPeriodLatestEpochInfo)
  
+
+/*
+ * The RGWPeriod object contains the entire configuration of a
+ * RGWRealm, including its RGWZoneGroups and RGWZones. Consistency of
+ * this configuration is maintained across all zones by passing around
+ * the RGWPeriod object in its JSON representation.
+ *
+ * If a new configuration changes which zone is the metadata master
+ * zone (i.e., master zone of the master zonegroup), then a new
+ * RGWPeriod::id (a uuid) is generated, its RGWPeriod::realm_epoch is
+ * incremented, and the RGWRealm object is updated to reflect that new
+ * current_period id and epoch. If the configuration changes BUT which
+ * zone is the metadata master does NOT change, then only the
+ * RGWPeriod::epoch is incremented (and the RGWPeriod::id remains the
+ * same).
+ *
+ * When a new RGWPeriod is created with a new RGWPeriod::id (uuid), it
+ * is linked back to its predecessor RGWPeriod through the
+ * RGWPeriod::predecessor_uuid field, thus creating a "linked
+ * list"-like structure of RGWPeriods back to the cluster's creation.
+ */
  class RGWPeriod
  {
-  std::string id;
+  std::string id; ///< a uuid
    epoch_t epoch{0};
    std::string predecessor_uuid;
    std::vector<std::string> sync_status;
author	J. Eric Ivancich <ivancich@redhat.com>
	Tue, 30 Apr 2019 18:01:19 +0000 (14:01 -0400)
committer	J. Eric Ivancich <ivancich@redhat.com>
	Fri, 3 May 2019 19:29:59 +0000 (15:29 -0400)
src/cls/rgw/cls_rgw.cc		patch \| blob \| history
src/rgw/rgw_zone.h		patch \| blob \| history