From: Samuel Just Date: Mon, 8 Sep 2014 03:13:41 +0000 (-0700) Subject: ReplicatedPG:start_flush send a second delete X-Git-Tag: v0.80.6~4^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=fd96eb62ece27f5c660429584c2ff2e058bc6e94;p=ceph.git ReplicatedPG:start_flush send a second delete Suppose we start with the following in the cache pool: 30:[29,21,20,15,10,4]:[22(21), 15(15,10), 4(4)]+head The object doesn't exist at 29 or 20. First, we flush 4 leaving the backing pool with: 3:[]+head Then, we begin to flush 15 with a delete with snapc 4:[4] leaving the backing pool with: 4:[4]:[4(4)] Then, we finish flushing 15 with snapc 9:[4] leaving the backing pool with: 9:[4]:[4(4)]+head Next, snaps 10 and 15 are removed causing clone 15 to be removed leaving the cache with: 30:[29,21,20,4]:[22(21),4(4)]+head We next begin to flush 22 by sending a delete with snapc 4:[4] since prev_snapc is 4 <---------- here is the bug The backing pool ignores this request since 4 < 9 (ORDERSNAP) leaving it with: 9:[4]:[4(4)] Then, we complete flushing 22 with snapc 19:[4] leaving the backing pool with: 19:[4]:[4(4)]+head Then, we begin to flush head by deleting with snapc 22:[21,20,4] leaving the backing pool with: 22:[21,20,4]:[22(21,20), 4(4)] Finally, we flush head leaving the backing pool with: 30:[29,21,20,4]:[22(21*,20*),4(4)]+head When we go to flush clone 22, all we know is that 22 is dirty, has snaps [21], and 4 is clean. As part of flushing 22, we need to do two things: 1) Ensure that the current head is cloned as cloneid 4 with snaps [4] by sending a delete at snapc 4:[4]. 2) Flush the data at snap sequence < 21 by sending a copyfrom with snapc 20:[20,4]. Unfortunately, it is possible that 1, 1&2, or 1 and part of the flush process for some other now non-existent clone have already been performed. Because of that, between 1) and 2), we need to send a second delete ensuring that the object does not exist at 20. 
Fixes: #9054 Backport: firefly Related: 66c7439ea0888777b5cfc08bcb0fbd7bfd8653c3 Signed-off-by: Samuel Just (cherry picked from commit 4843fd510b33a71999cdf9c2cfa2b4c318fa80fd) --- diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 43cb8c948ce2..f7c97ad52335 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6183,56 +6183,53 @@ int ReplicatedPG::start_flush( cancel_flush(fop, false); } - // construct a SnapContext appropriate for this clone/head - SnapContext dsnapc; - dsnapc.seq = 0; - SnapContext snapc; - if (soid.snap == CEPH_NOSNAP) { - snapc.seq = snapset.seq; - snapc.snaps = snapset.snaps; + /** + * In general, we need to send two deletes and a copyfrom. + * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)] + * where 4 is marked as clean. To flush 10, we have to: + * 1) delete 4:[4,3,2] -- ensure head is created at cloneid 4 + * 2) delete (8-1):[4,3,2] -- ensure that the object does not exist at 8 + * 3) copyfrom 8:[8,4,3,2] -- flush object excluding snap 8 + * + * The second delete is required in case at some point in the past + * there had been a clone 7(7,6), which we had flushed. Without + * the second delete, the object would appear in the base pool to + * have existed. 
+ */ - if (!snapset.clones.empty() && snapset.clones.back() != snapset.seq) { - dsnapc.seq = snapset.clones.back(); - vector::iterator p = snapset.snaps.begin(); - while (p != snapset.snaps.end() && *p > dsnapc.seq) - ++p; - dsnapc.snaps = vector(p, snapset.snaps.end()); + SnapContext snapc, dsnapc, dsnapc2; + if (snapset.seq != 0) { + if (soid.snap == CEPH_NOSNAP) { + snapc.seq = snapset.seq; + snapc.snaps = snapset.snaps; + } else { + snapid_t min_included_snap = oi.snaps.back(); + snapc = snapset.get_ssc_as_of(min_included_snap - 1); } - } else { - vector::iterator citer = std::find( - snapset.clones.begin(), - snapset.clones.end(), - soid.snap); - assert(citer != snapset.clones.end()); - snapid_t prev_snapc = (citer == snapset.clones.begin()) ? - snapid_t(0) : *(citer - 1); - - vector::iterator p = snapset.snaps.begin(); - while (p != snapset.snaps.end() && *p >= oi.snaps.back()) - ++p; - snapc.snaps = vector(p, snapset.snaps.end()); - vector::iterator dnewest = p; - - // we may need to send a delete first - while (p != snapset.snaps.end() && *p > prev_snapc) - ++p; - dsnapc.snaps = vector(p, snapset.snaps.end()); + snapid_t prev_snapc = 0; + for (vector::reverse_iterator citer = snapset.clones.rbegin(); + citer != snapset.clones.rend(); + ++citer) { + if (*citer < soid.snap) { + prev_snapc = *citer; + break; + } + } - if (p == dnewest) { - // no snaps between the oldest in this clone and prev_snapc - snapc.seq = prev_snapc; - } else { - // snaps between oldest in this clone and prev_snapc, send delete - dsnapc.seq = prev_snapc; - snapc.seq = oi.snaps.back() - 1; + if (prev_snapc != snapc.seq) { + dsnapc = snapset.get_ssc_as_of(prev_snapc); + snapid_t first_snap_after_prev_snapc = + snapset.get_first_snap_after(prev_snapc, snapc.seq); + dsnapc2 = snapset.get_ssc_as_of( + first_snap_after_prev_snapc - 1); } } object_locator_t base_oloc(soid); base_oloc.pool = pool.info.tier_of; - if (dsnapc.seq > 0) { + if (dsnapc.seq > 0 && dsnapc.seq < snapc.seq) { 
ObjectOperation o; o.remove(); osd->objecter_lock.Lock(); @@ -6250,6 +6247,22 @@ int ReplicatedPG::start_flush( osd->objecter_lock.Unlock(); } + if (dsnapc2.seq > dsnapc.seq && dsnapc2.seq < snapc.seq) { + ObjectOperation o; + o.remove(); + osd->objecter->mutate( + soid.oid, + base_oloc, + o, + dsnapc2, + oi.mtime, + (CEPH_OSD_FLAG_IGNORE_OVERLAY | + CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ENFORCE_SNAPC), + NULL, + NULL /* no callback, we'll rely on the ordering w.r.t the next op */); + } + FlushOpRef fop(new FlushOp); fop->obc = obc; fop->flushed_version = oi.user_version; diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 371052397b2b..8e9cf6f534b7 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2535,6 +2535,29 @@ struct SnapSet { void decode(bufferlist::iterator& bl); void dump(Formatter *f) const; static void generate_test_instances(list& o); + + SnapContext get_ssc_as_of(snapid_t as_of) const { + SnapContext out; + out.seq = as_of; + for (vector::const_iterator i = snaps.begin(); + i != snaps.end(); + ++i) { + if (*i <= as_of) + out.snaps.push_back(*i); + } + return out; + } + + // return min element of snaps > after, return max if no such element + snapid_t get_first_snap_after(snapid_t after, snapid_t max) const { + for (vector::const_reverse_iterator i = snaps.rbegin(); + i != snaps.rend(); + ++i) { + if (*i > after) + return *i; + } + return max; + } }; WRITE_CLASS_ENCODER(SnapSet)