}
/*
- * we now hold two different indexes for objects. The first one holds the list of objects in the
- * order that we want them to be listed. The second one only holds the objects instances (for
- * versioned objects), and they're not arranged in any particular order.
- * When listing objects we'll use the first index, when doing operations on the objects themselves
- * we'll use the second index. Note that regular objects only map to the first index anyway
+ * We hold two different indexes for objects. The first one holds the
+ * list of objects in the order that we want them to be listed. The
+ * second one only holds the object instances (for versioned
+ * objects), and they're not arranged in any particular order. When
+ * listing objects we'll use the first index; when doing operations on
+ * the objects themselves we'll use the second index. Note that
+ * regular objects only map to the first index anyway.
*/
static void get_list_index_key(rgw_bucket_dir_entry& entry, string *index_key)
}
/*
- * link an object version to an olh, update the relevant index entries. It will also handle the
- * deletion marker case. We have a few entries that we need to take care of. For object 'foo',
+ * Link an object version to an olh and update the relevant index
+ * entries. It will also handle the deletion marker case. We have a
+ * few entries that we need to take care of. For object 'foo',
* instance BAR, we'd update the following (not actual encoding):
+ *
* - olh data: [BI_BUCKET_OLH_DATA_INDEX]foo
* - object instance data: [BI_BUCKET_OBJ_INSTANCE_INDEX]foo,BAR
* - object instance list entry: foo,123,BAR
*
- * The instance list entry needs to be ordered by newer to older, so we generate an appropriate
- * number string that follows the name.
- * The top instance for each object is marked appropriately.
- * We generate instance entry for deletion markers here, as they are not created prior.
+ * The instance list entry needs to be ordered from newer to older, so
+ * we generate an appropriate number string that follows the name.
+ * The top instance for each object is marked appropriately. We
+ * generate the instance entry for deletion markers here, as they are
+ * not created prior.
*/
static int rgw_bucket_link_olh(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
unmod.tv_nsec = 0;
}
if (mtime >= unmod) {
- return 0; /* no need to set error, we just return 0 and avoid writing to the bi log */
+ return 0; /* no need to set error, we just return 0 and avoid
+ * writing to the bi log */
}
}
bool removing;
/*
- * Special handling for null instance object / delete-marker. For these objects we're going to
- * have separate instances for a data object vs. delete-marker to avoid collisions. We now check
- * if we got to overwrite a previous entry, and in that case we'll remove its list entry.
+ * Special handling for null instance object / delete-marker. For
+ * these objects we're going to have separate instances for a data
+ * object vs. delete-marker to avoid collisions. We now check
+ * whether we are overwriting a previous entry, and in that case
+ * we'll remove its list entry.
*/
if (op.key.instance.empty()) {
BIVerObjEntry other_obj(hctx, op.key);
- ret = other_obj.init(!op.delete_marker); /* try reading the other null versioned entry */
+ ret = other_obj.init(!op.delete_marker); /* try reading the other
+ * null versioned
+ * entry */
existed = (ret >= 0 && !other_obj.is_delete_marker());
if (ret >= 0 && other_obj.is_delete_marker() != op.delete_marker) {
ret = other_obj.unlink_list_entry();
if (!obj.is_delete_marker()) {
olh.update_log(CLS_RGW_OLH_OP_REMOVE_INSTANCE, op.op_tag, op.key, false);
} else {
- /* this is a delete marker, it's our responsibility to remove its instance entry */
+ /* this is a delete marker, it's our responsibility to remove its
+ * instance entry */
ret = obj.unlink();
if (ret < 0) {
return ret;
rgw_bucket_entry_ver ver;
ver.epoch = (op.olh_epoch ? op.olh_epoch : olh.get_epoch());
- real_time mtime = obj.mtime(); /* mtime has no real meaning in instance removal context */
+ real_time mtime = obj.mtime(); /* mtime has no real meaning in
+ * instance removal context */
ret = log_index_operation(hctx, op.key, CLS_RGW_OP_UNLINK_INSTANCE, op.op_tag,
mtime, ver,
CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker,
return ret;
}
+
/*
- * We hold the garbage collection chain data under two different indexes: the first 'name' index
- * keeps them under a unique tag that represents the chains, and a second 'time' index keeps
- * them by their expiration timestamp
+ * We hold the garbage collection chain data under two different
+ * indexes: the first 'name' index keeps them under a unique tag that
+ * represents the chains, and a second 'time' index keeps them by
+ * their expiration timestamp. Each is prefixed differently (see
+ * gc_index_prefixes below).
+ *
+ * Since key-value data is listed in lexical order by keys, generally
+ * the name entries are retrieved first and then the time entries.
+ * When listing the entries via `gc_iterate_entries` one parameter is
+ * a marker, and if we were to pass "1_" (i.e.,
+ * gc_index_prefixes[GC_OBJ_TIME_INDEX]), the listing would skip over
+ * the 'name' entries and begin with the 'time' entries.
+ *
+ * Furthermore, the times are converted to strings such that lexical
+ * order correlates with chronological order, so the entries are
+ * returned chronologically from the earliest expiring to the latest
+ * expiring. This allows for starting at "1_" and retrieving
+ * successive chunks of entries; as long as they are prior to the
+ * current time, they're expired and processing can continue.
*/
#define GC_OBJ_NAME_INDEX 0
#define GC_OBJ_TIME_INDEX 1
};
WRITE_CLASS_ENCODER(RGWPeriodLatestEpochInfo)
+
+/*
+ * The RGWPeriod object contains the entire configuration of an
+ * RGWRealm, including its RGWZoneGroups and RGWZones. Consistency of
+ * this configuration is maintained across all zones by passing around
+ * the RGWPeriod object in its JSON representation.
+ *
+ * If a new configuration changes which zone is the metadata master
+ * zone (i.e., master zone of the master zonegroup), then a new
+ * RGWPeriod::id (a uuid) is generated, its RGWPeriod::realm_epoch is
+ * incremented, and the RGWRealm object is updated to reflect that new
+ * current_period id and epoch. If the configuration changes BUT which
+ * zone is the metadata master does NOT change, then only the
+ * RGWPeriod::epoch is incremented (and the RGWPeriod::id remains the
+ * same).
+ *
+ * When a new RGWPeriod is created with a new RGWPeriod::id (uuid), it
+ * is linked back to its predecessor RGWPeriod through the
+ * RGWPeriod::predecessor_uuid field, thus creating a "linked
+ * list"-like structure of RGWPeriods back to the cluster's creation.
+ */
class RGWPeriod
{
- std::string id;
+ std::string id; ///< a uuid
epoch_t epoch{0};
std::string predecessor_uuid;
std::vector<std::string> sync_status;