From: Samuel Just Date: Tue, 26 Aug 2014 21:13:57 +0000 (-0700) Subject: ReplicatedPG:start_flush send a second delete X-Git-Tag: v0.85~3^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F2348%2Fhead;p=ceph.git ReplicatedPG:start_flush send a second delete Suppose we start with the following in the cache pool: 30:[29,21,20,15,10,4]:[22(21), 15(15,10), 4(4)]+head The object doesn't exist at 29 or 20. First, we flush 4 leaving the backing pool with: 3:[]+head Then, we begin to flush 15 with a delete with snapc 4:[4] leaving the backing pool with: 4:[4]:[4(4)] Then, we finish flushing 15 with snapc 9:[4] leaving the backing pool with: 9:[4]:[4(4)]+head Next, snaps 10 and 15 are removed causing clone 15 to be removed leaving the cache with: 30:[29,21,20,4]:[22(21),4(4)]+head We next begin to flush 22 by sending a delete with snapc 4:[4] since prev_snapc is 4 <---------- here is the bug The backing pool ignores this request since 4 < 9 (ORDERSNAP) leaving it with: 9:[4]:[4(4)] Then, we complete flushing 22 with snapc 19:[4] leaving the backing pool with: 19:[4]:[4(4)]+head Then, we begin to flush head by deleting with snapc 22:[21,20,4] leaving the backing pool with: 22:[21,20,4]:[22(21,20), 4(4)] Finally, we flush head leaving the backing pool with: 30:[29,21,20,4]:[22(21*,20*),4(4)]+head When we go to flush clone 22, all we know is that 22 is dirty, has snaps [21], and 4 is clean. As part of flushing 22, we need to do two things: 1) Ensure that the current head is cloned as cloneid 4 with snaps [4] by sending a delete at snapc 4:[4]. 2) Flush the data at snap sequence < 21 by sending a copyfrom with snapc 20:[20,4]. Unfortunately, it is possible that 1), both 1) and 2), or 1) plus part of the flush process for some other, now non-existent, clone have already been performed. Because of that, between 1) and 2), we need to send a second delete ensuring that the object does not exist at 20. 
Fixes: #9054 Backport: firefly Signed-off-by: Samuel Just --- diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index b18ddeb8ed5b..eb4ea5606ebb 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6263,7 +6263,19 @@ int ReplicatedPG::start_flush( cancel_flush(fop, false); } - // construct a SnapContext appropriate for this clone/head + /** + * In general, we need to send two deletes and a copyfrom. + * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)] + * where 4 is marked as clean. To flush 10, we have to: + * 1) delete 4:[4,3,2] -- ensure head is created at cloneid 4 + * 2) delete (8-1):[4,3,2] -- ensure that the object does not exist at 8 + * 3) copyfrom 8:[8,4,3,2] -- flush object excluding snap 8 + * + * The second delete is required in case at some point in the past + * there had been a clone 7(7,6), which we had flushed. Without + * the second delete, the object would appear in the base pool to + * have existed. + */ SnapContext dsnapc; dsnapc.seq = 0; SnapContext snapc; @@ -6295,17 +6307,15 @@ int ReplicatedPG::start_flush( vector::iterator dnewest = p; // we may need to send a delete first - while (p != snapset.snaps.end() && *p > prev_snapc) - ++p; - dsnapc.snaps = vector(p, snapset.snaps.end()); + if (prev_snapc + 1 < *dnewest) { + while (p != snapset.snaps.end() && *p > prev_snapc) + ++p; + dsnapc.snaps = vector(p, snapset.snaps.end()); - if (p == dnewest) { - // no snaps between the oldest in this clone and prev_snapc - snapc.seq = prev_snapc; - } else { - // snaps between oldest in this clone and prev_snapc, send delete dsnapc.seq = prev_snapc; snapc.seq = oi.snaps.back() - 1; + } else { + snapc.seq = prev_snapc; } } @@ -6328,6 +6338,40 @@ int ReplicatedPG::start_flush( NULL, NULL /* no callback, we'll rely on the ordering w.r.t the next op */); osd->objecter_lock.Unlock(); + + // do we need to send the second delete? 
+ SnapContext dsnapc2; + vector::reverse_iterator rp = snapset.snaps.rbegin(); + + // advance rp to the smallest snap not contained by the last flushed clone + while (rp != snapset.snaps.rend() && *rp <= dsnapc.seq) + ++rp; + + // set dnsnapc2.seq to be the snap prior to that snap (the object did not + // exist at *rq, so it must have been deleted prior to that). + dsnapc2.seq = (rp == snapset.snaps.rend()) ? snapset.seq : *rp; + if (dsnapc2.seq > 0) + dsnapc2.seq.val -= 1; + + if (dsnapc2.seq != dsnapc.seq) { + dsnapc2.snaps = dsnapc.snaps; + + ObjectOperation o2; + o2.remove(); + osd->objecter_lock.Lock(); + osd->objecter->mutate( + soid.oid, + base_oloc, + o2, + dsnapc2, + oi.mtime, + (CEPH_OSD_FLAG_IGNORE_OVERLAY | + CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ENFORCE_SNAPC), + NULL, + NULL /* no callback, we'll rely on the ordering w.r.t the next op */); + osd->objecter_lock.Unlock(); + } } FlushOpRef fop(new FlushOp);