The PG may be doing work relative to a different epoch than the one the OSD
currently has. Make sure the PG removal message is queued under the PG's epoch,
not the OSD's, to avoid confusing/crashing the recipient like so:
2012-02-10 23:26:35.691793
7f387281f700 osd.3 514 queue_pg_for_deletion: 0.0
osd/OSD.cc: In function 'void OSD::handle_pg_remove(OpRequest*)' thread
7f387281f700 time 2012-02-10 23:26:35.691820
osd/OSD.cc: 4860: FAILED assert(pg->get_primary() == m->get_source().num())
Signed-off-by: Sage Weil <sage@newdream.net>
Mutex remove_list_lock;
map<epoch_t, map<int, vector<pg_t> > > remove_list;
- void queue_for_removal(int osd, pg_t pgid) {
+ void queue_for_removal(epoch_t epoch, int osd, pg_t pgid) {
remove_list_lock.Lock();
- remove_list[osdmap->get_epoch()][osd].push_back(pgid);
+ remove_list[epoch][osd].push_back(pgid);
remove_list_lock.Unlock();
}
p++) {
if (get_osdmap()->is_up(*p)) {
dout(10) << "sending PGRemove to osd." << *p << dendl;
- osd->queue_for_removal(*p, info.pgid);
+ osd->queue_for_removal(get_osdmap()->get_epoch(), *p, info.pgid);
stray_purged.insert(*p);
} else {
dout(10) << "not sending PGRemove to down osd." << *p << dendl;