From 8b27472bbd8a22372cd14c0c5603ee41056a3343 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Mon, 8 Nov 2021 21:24:52 -0500 Subject: [PATCH] cls/rgw: index cancelation still cleans up remove_objs when multipart uploads complete their final bucket index transaction, they pass the list of part objects in 'remove_objs' for bulk removal - the part objects, along with their bucket stats, get replaced by the head object but if CompleteMultipart races with another upload, the head object write will fail with ECANCELED and the bucket index transaction gets canceled with CLS_RGW_OP_CANCEL. these canceled uploads still need to clean up their 'remove_objs', but cancelation was returning too early. as a result, these bucket index entries get orphaned and leave the bucket stats inconsistent this commit reworks rgw_bucket_complete_op() so that CLS_RGW_OP_CANCEL is handled the same way as OP_ADD and OP_DEL, so always runs the loop to clean up 'remove_objs' Fixes: https://tracker.ceph.com/issues/53199 Signed-off-by: Casey Bodley --- src/cls/rgw/cls_rgw.cc | 105 +++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc index 12f1445f807..31cef6d0679 100644 --- a/src/cls/rgw/cls_rgw.cc +++ b/src/cls/rgw/cls_rgw.cc @@ -1002,7 +1002,7 @@ static int complete_remove_obj(cls_method_context_t hctx, string idx; int ret = read_key_entry(hctx, key, &idx, &entry); if (ret < 0) { - CLS_LOG(1, "%s: read_index_entry name=%s instance=%s failed with %d", + CLS_LOG(1, "%s: read_key_entry name=%s instance=%s failed with %d", __func__, key.name.c_str(), key.instance.c_str(), ret); return ret; } @@ -1085,73 +1085,78 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist entry.pending_map.erase(pinter); } - bool cancel = false; - bufferlist update_bl; - if (op.tag.size() && op.op == CLS_RGW_OP_CANCEL) { CLS_LOG(1, "rgw_bucket_complete_op(): cancel requested\n"); - cancel = true; } else if (op.ver.pool == entry.ver.pool && op.ver.epoch && op.ver.epoch <= entry.ver.epoch) { CLS_LOG(1, "rgw_bucket_complete_op(): skipping request, old epoch\n"); - cancel = true; + op.op = CLS_RGW_OP_CANCEL; } - bufferlist op_bl; - if (cancel) { + // controls whether remove_objs deletions are logged + const bool default_log_op = op.log_op && !header.syncstopped; + // controls whether this operation is logged (depends on op.op and ondisk) + bool log_op = default_log_op; + + entry.ver = op.ver; + if (op.op == CLS_RGW_OP_CANCEL) { + log_op = false; // don't log cancelation if (op.tag.size()) { + // we removed this tag from pending_map so need to write the changes bufferlist new_key_bl; encode(entry, new_key_bl); - return cls_cxx_map_set_val(hctx, idx, &new_key_bl); + rc = cls_cxx_map_set_val(hctx, idx, &new_key_bl); + if (rc < 0) { + return rc; + } } - return 0; - } + } // CLS_RGW_OP_CANCEL + else if (op.op == CLS_RGW_OP_DEL) { + // unaccount deleted entry + unaccount_entry(header, entry); - unaccount_entry(header, entry); - - entry.ver = op.ver; - switch ((int)op.op) { - case CLS_RGW_OP_DEL: entry.meta = op.meta; - if (ondisk) { - if (!entry.pending_map.size()) { - int ret = cls_cxx_map_remove_key(hctx, idx); - if (ret < 0) - return ret; - } else { - entry.exists = false; - bufferlist new_key_bl; - encode(entry, new_key_bl); - int ret = cls_cxx_map_set_val(hctx, idx, &new_key_bl); - if (ret < 0) - return ret; + if (!ondisk) { + // no entry to erase + log_op = false; + } else if (!entry.pending_map.size()) { + rc = cls_cxx_map_remove_key(hctx, idx); + if (rc < 0) { + return rc; } } else { - return -ENOENT; - } - break; - case CLS_RGW_OP_ADD: - { - rgw_bucket_dir_entry_meta& meta = op.meta; - rgw_bucket_category_stats& stats = header.stats[meta.category]; - entry.meta = meta; - entry.key = op.key; - entry.exists = true; - entry.tag = op.tag; - stats.num_entries++; - stats.total_size += meta.accounted_size; - stats.total_size_rounded += cls_rgw_get_rounded_size(meta.accounted_size); - stats.actual_size += meta.size; + entry.exists = false; bufferlist new_key_bl; encode(entry, new_key_bl); - int ret = cls_cxx_map_set_val(hctx, idx, &new_key_bl); - if (ret < 0) - return ret; + rc = cls_cxx_map_set_val(hctx, idx, &new_key_bl); + if (rc < 0) { + return rc; + } } - break; - } + } // CLS_RGW_OP_DEL + else if (op.op == CLS_RGW_OP_ADD) { + // unaccount overwritten entry + unaccount_entry(header, entry); + + rgw_bucket_dir_entry_meta& meta = op.meta; + rgw_bucket_category_stats& stats = header.stats[meta.category]; + entry.meta = meta; + entry.key = op.key; + entry.exists = true; + entry.tag = op.tag; + // account for new entry + stats.num_entries++; + stats.total_size += meta.accounted_size; + stats.total_size_rounded += cls_rgw_get_rounded_size(meta.accounted_size); + stats.actual_size += meta.size; + bufferlist new_key_bl; + encode(entry, new_key_bl); + rc = cls_cxx_map_set_val(hctx, idx, &new_key_bl); + if (rc < 0) { + return rc; + } + } // CLS_RGW_OP_ADD - const bool log_op = (op.log_op && !header.syncstopped); if (log_op) { rc = log_index_operation(hctx, op.key, op.op, op.tag, entry.meta.mtime, entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, @@ -1165,7 +1170,7 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist CLS_LOG(20, "rgw_bucket_complete_op(): remove_objs.size()=%d", (int)op.remove_objs.size()); for (const auto& remove_key : op.remove_objs) { - rc = complete_remove_obj(hctx, header, remove_key, log_op); + rc = complete_remove_obj(hctx, header, remove_key, default_log_op); if (rc < 0) { continue; // part cleanup errors are not fatal } -- 2.39.5