From: J. Eric Ivancich Date: Tue, 30 Apr 2019 18:01:19 +0000 (-0400) Subject: rgw: add some comments to rgw code to help explain functionality X-Git-Tag: v15.1.0~2762^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=8111dd712b324df7a473706217175f3f4a0e5918;p=ceph-ci.git rgw: add some comments to rgw code to help explain functionality Comments to explain the role of RGWPeriod and the use of prefixes in gc log entries are added. Some other comments are reformatted for 80 columns. Signed-off-by: J. Eric Ivancich --- diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc index a3327374277..ed68d2bc5ca 100644 --- a/src/cls/rgw/cls_rgw.cc +++ b/src/cls/rgw/cls_rgw.cc @@ -214,11 +214,13 @@ static void decreasing_str(uint64_t num, string *str) } /* - * we now hold two different indexes for objects. The first one holds the list of objects in the - * order that we want them to be listed. The second one only holds the objects instances (for - * versioned objects), and they're not arranged in any particular order. - * When listing objects we'll use the first index, when doing operations on the objects themselves - * we'll use the second index. Note that regular objects only map to the first index anyway + * We hold two different indexes for objects. The first one holds the + * list of objects in the order that we want them to be listed. The + * second one only holds the objects instances (for versioned + * objects), and they're not arranged in any particular order. When + * listing objects we'll use the first index, when doing operations on + * the objects themselves we'll use the second index. Note that + * regular objects only map to the first index anyway */ static void get_list_index_key(rgw_bucket_dir_entry& entry, string *index_key) @@ -1385,17 +1387,20 @@ static int convert_plain_entry_to_versioned(cls_method_context_t hctx, cls_rgw_o } /* - * link an object version to an olh, update the relevant index entries. 
It will also handle the - * deletion marker case. We have a few entries that we need to take care of. For object 'foo', + * Link an object version to an olh, update the relevant index + * entries. It will also handle the deletion marker case. We have a + * few entries that we need to take care of. For object 'foo', * instance BAR, we'd update the following (not actual encoding): + * * - olh data: [BI_BUCKET_OLH_DATA_INDEX]foo * - object instance data: [BI_BUCKET_OBJ_INSTANCE_INDEX]foo,BAR * - object instance list entry: foo,123,BAR * - * The instance list entry needs to be ordered by newer to older, so we generate an appropriate - * number string that follows the name. - * The top instance for each object is marked appropriately. - * We generate instance entry for deletion markers here, as they are not created prior. + * The instance list entry needs to be ordered by newer to older, so + * we generate an appropriate number string that follows the name. + * The top instance for each object is marked appropriately. We + * generate instance entry for deletion markers here, as they are not + * created prior. */ static int rgw_bucket_link_olh(cls_method_context_t hctx, bufferlist *in, bufferlist *out) { @@ -1433,20 +1438,25 @@ static int rgw_bucket_link_olh(cls_method_context_t hctx, bufferlist *in, buffer unmod.tv_nsec = 0; } if (mtime >= unmod) { - return 0; /* no need to set error, we just return 0 and avoid writing to the bi log */ + return 0; /* no need to set error, we just return 0 and avoid + * writing to the bi log */ } } bool removing; /* - * Special handling for null instance object / delete-marker. For these objects we're going to - * have separate instances for a data object vs. delete-marker to avoid collisions. We now check - * if we got to overwrite a previous entry, and in that case we'll remove its list entry. + * Special handling for null instance object / delete-marker. 
For + * these objects we're going to have separate instances for a data + * object vs. delete-marker to avoid collisions. We now check if we + * got to overwrite a previous entry, and in that case we'll remove + * its list entry. */ if (op.key.instance.empty()) { BIVerObjEntry other_obj(hctx, op.key); - ret = other_obj.init(!op.delete_marker); /* try reading the other null versioned entry */ + ret = other_obj.init(!op.delete_marker); /* try reading the other + * null versioned + * entry */ existed = (ret >= 0 && !other_obj.is_delete_marker()); if (ret >= 0 && other_obj.is_delete_marker() != op.delete_marker) { ret = other_obj.unlink_list_entry(); @@ -1687,7 +1697,8 @@ static int rgw_bucket_unlink_instance(cls_method_context_t hctx, bufferlist *in, if (!obj.is_delete_marker()) { olh.update_log(CLS_RGW_OLH_OP_REMOVE_INSTANCE, op.op_tag, op.key, false); } else { - /* this is a delete marker, it's our responsibility to remove its instance entry */ + /* this is a delete marker, it's our responsibility to remove its + * instance entry */ ret = obj.unlink(); if (ret < 0) { return ret; @@ -1715,7 +1726,8 @@ static int rgw_bucket_unlink_instance(cls_method_context_t hctx, bufferlist *in, rgw_bucket_entry_ver ver; ver.epoch = (op.olh_epoch ? 
op.olh_epoch : olh.get_epoch()); - real_time mtime = obj.mtime(); /* mtime has no real meaning in instance removal context */ + real_time mtime = obj.mtime(); /* mtime has no real meaning in + * instance removal context */ ret = log_index_operation(hctx, op.key, CLS_RGW_OP_UNLINK_INSTANCE, op.op_tag, mtime, ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker, @@ -3157,10 +3169,27 @@ int rgw_usage_log_clear(cls_method_context_t hctx, bufferlist *in, bufferlist *o return ret; } + /* - * We hold the garbage collection chain data under two different indexes: the first 'name' index - * keeps them under a unique tag that represents the chains, and a second 'time' index keeps - * them by their expiration timestamp + * We hold the garbage collection chain data under two different + * indexes: the first 'name' index keeps them under a unique tag that + * represents the chains, and a second 'time' index keeps them by + * their expiration timestamp. Each is prefixed differently (see + * gc_index_prefixes below). + * + * Since key-value data is listed in lexical order by keys, generally + * the name entries are retrieved first and then the time entries. + * When listing the entries via `gc_iterate_entries` one parameter is + * a marker, and if we were to pass "1_" (i.e., + * gc_index_prefixes[GC_OBJ_TIME_INDEX]), the listing would skip over + * the 'name' entries and begin with the 'time' entries. + * + * Furthermore, the times are converted to strings such that lexical + * order correlates with chronological order, so the entries are + * returned chronologically from the earliest expiring to the latest + * expiring. This allows for starting at "1_" and to keep retrieving + * chunks of entries, and as long as they are prior to the current + * time, they're expired and processing can continue. 
*/ #define GC_OBJ_NAME_INDEX 0 #define GC_OBJ_TIME_INDEX 1 diff --git a/src/rgw/rgw_zone.h b/src/rgw/rgw_zone.h index 6f1bd64378a..19603d6caf1 100644 --- a/src/rgw/rgw_zone.h +++ b/src/rgw/rgw_zone.h @@ -975,9 +975,30 @@ struct RGWPeriodLatestEpochInfo { }; WRITE_CLASS_ENCODER(RGWPeriodLatestEpochInfo) + +/* + * The RGWPeriod object contains the entire configuration of a + * RGWRealm, including its RGWZoneGroups and RGWZones. Consistency of + * this configuration is maintained across all zones by passing around + * the RGWPeriod object in its JSON representation. + * + * If a new configuration changes which zone is the metadata master + * zone (i.e., master zone of the master zonegroup), then a new + * RGWPeriod::id (a uuid) is generated, its RGWPeriod::realm_epoch is + * incremented, and the RGWRealm object is updated to reflect that new + * current_period id and epoch. If the configuration changes BUT which + * zone is the metadata master does NOT change, then only the + * RGWPeriod::epoch is incremented (and the RGWPeriod::id remains the + * same). + * + * When a new RGWPeriod is created with a new RGWPeriod::id (uuid), it + * is linked back to its predecessor RGWPeriod through the + * RGWPeriod::predecessor_uuid field, thus creating a "linked + * list"-like structure of RGWPeriods back to the cluster's creation. + */ class RGWPeriod { - std::string id; + std::string id; //< a uuid epoch_t epoch{0}; std::string predecessor_uuid; std::vector<std::string> sync_status;