From aad5d47be64ed7feba79f540ec1debc45625a74f Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Sat, 20 Apr 2019 16:34:12 +0800 Subject: [PATCH] osd/PG: fix last_complete re-calculation on splitting We add hard-limit for pg_logs now, which means we might keep trimming old log entries irrespective of pg's current missing_set. This as a result can cause the last_complete pointer moving far ahead of the real on-disk version (the oldest need of missing_set, for instance) the corresponding pg should have on splitting: ``` 2019-04-19 06:41:52.559247 7efd4725c700 10 osd.2 271 Splitting pg[5.6( v 270'943 lc 0'0 (238'300,270'943] local-lis/les=250/251 n=943 ec=223/223 lis/c 250/223 les/251/224/0 250/271/229) [5,2] r=1 lpr=271 pi=[223,271)/4 crt=270'943 unknown NOTIFY m=518 mbc={}] into 5.16 2019-04-19 06:41:52.561413 7efd4725c700 10 osd.2 pg_epoch: 271 pg[5.6( v 270'943 lc 238'300 (238'300,270'943] local-lis/les=250/251 n=943 ec=223/223 lis/c 250/223 c/f 251/224/0 250/271/229) [5,2] r=1 lpr=271 pi=[223,271)/4 crt=270'943 lcod 0'0 unknown NOTIFY m=261 mbc={}] release_backoffs [MIN,MAX) ``` For the above example, parent's last_complete cursor changed from **0'0** to **238'300** directly due to the effort of trying to catch up the oldest log entry changing when splitting was done. However, back into v12.2.9 primary would still reference shard's last_complete field when trying to figure out all possible locations of a currently missing object (see PG::MissingLoc::add_source_info): ```c++ if (oinfo.last_complete < need) { if (omissing.is_missing(soid)) { ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need << " also missing on osd." << fromosd << dendl; continue; } } ``` Hence a wrongly calculated last_complete could then make primary mis-consider that a specific shard might have the authoritative object it currently looking for: ``` 2019-04-19 06:41:52.904163 7fd4cfb5a700 10 osd.5 pg_epoch: 271 pg[5.6( v 270'943 lc 238'300 (238'300,270'943] local-lis/les=250/251 n=471 ec=223/223 lis/c 250/223 les/ c/f 251/224/0 250/271/229) [5,2] r=0 lpr=271 pi=[223,271)/4 crt=270'943 lcod 226'77 mlcod 0'0 peering m=16 mbc={}] proc_replica_log for osd.2: 5.6( v 270'943 lc 238'30 0 (238'300,270'943] local-lis/les=250/251 n=471 ec=223/223 lis/c 250/223 les/c/f 251/224/0 250/271/229) log((249'563,270'943], crt=270'943) missing(261 may_include_del etes = 1) 2019-04-19 06:41:52.904645 7fd4cfb5a700 20 osd.5 pg_epoch: 271 pg[5.6( v 270'943 lc 238'300 (238'300,270'943] local-lis/les=250/251 n=471 ec=223/223 lis/c 250/223 les/ c/f 251/224/0 250/271/229) [5,2] r=0 lpr=271 pi=[223,271)/4 crt=270'943 lcod 226'77 mlcod 0'0 peering m=16 mbc={}] after missing 5:624c3a7a:::benchmark_data_smithi190 _39968_object1382:head need 226'110 have 0'0 2019-04-19 06:41:53.567820 7fd4d035b700 10 osd.5 pg_epoch: 272 pg[5.6( v 270'943 lc 0'0 (238'300,270'943] local-lis/les=271/272 n=471 ec=223/223 lis/c 250/223 les/c/f 251/224/0 250/271/229) [5,2] r=0 lpr=271 pi=[223,271)/4 crt=270'943 lcod 226'77 mlcod 0'0 unknown m=16 u=13 mbc={255={(1+0)=220,(2+0)=28}}] search_for_missing 5:624c3a 7a:::benchmark_data_smithi190_39968_object1382:head 226'110 is on osd.2 ``` note that ```5:624c3a7a:::benchmark_data_smithi190_39968_object1382:head 226'110``` was actually missing on both primary and shard osd.2 whereas primary insisted that object should exist on shard osd.2! https://github.com/ceph/ceph/pull/26175 posted an indirect fix for the above problem by ignoring last_complete when checking the missing set, but it should generally make more sense to fill in the last_complete field correctly whenever possible. Hence coming this additional fix. Fixes: http://tracker.ceph.com/issues/26958 Signed-off-by: xie xingguo --- src/osd/PG.cc | 7 +++---- src/osd/PGLog.h | 2 ++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index ffdce25ab22..4f1872cf493 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2624,10 +2624,9 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) info.log_tail = pg_log.get_tail(); child->info.log_tail = child->pg_log.get_tail(); - if (info.last_complete < pg_log.get_tail()) - info.last_complete = pg_log.get_tail(); - if (child->info.last_complete < child->pg_log.get_tail()) - child->info.last_complete = child->pg_log.get_tail(); + // reset last_complete, we might have modified pg_log & missing above + pg_log.reset_complete_to(&info); + child->pg_log.reset_complete_to(&child->info); // Info child->info.history = info.history; diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index 6b492785e9e..f02b4c4c23d 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -797,6 +797,8 @@ public: } void reset_complete_to(pg_info_t *info) { + if (log.log.empty()) // caller is split_into() + return; log.complete_to = log.log.begin(); ceph_assert(log.complete_to != log.log.end()); auto oldest_need = missing.get_oldest_need(); -- 2.39.5