osd: leave PG registered (and stray) during delete; reimplement pg deletion
Author:     Sage Weil <sage@redhat.com>
AuthorDate: Sat, 9 Dec 2017 20:42:10 +0000 (14:42 -0600)
Commit:     Sage Weil <sage@redhat.com>
CommitDate: Thu, 11 Jan 2018 23:07:00 +0000 (17:07 -0600)
A lot of awkward complexity is implemented in the OSD to handle PGs that
aren't in pg_map because they are in the process of being deleted.  This is
hard because if the PG is recreated (or split, or whatever) we need to stop
the deletion and create a fresh PG referencing the same data.

Instead, leave deleting PGs registered and Stray, with a new peering state
Deleting (Started/Deleting, reached from Stray or ReplicaActive via a
DeleteStart event).  Let them continue to process OSDMaps, splits, and peering
intervals.  If they are not fully deleted when a new interval starts, they go
back through Reset -> Stray, the new primary gets the notify, and it decides
what to do with them (usually instruct them to delete again).

This (1) streamlines and cleans up the code structure, and (2) gets rid of
the special-purpose RemoveWQ, moving the delete work into the main op_wq
where it can be throttled and prioritized like any other work.

Signed-off-by: Sage Weil <sage@redhat.com>
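
The moving parts land in several files below: the DeleteStart/DeleteSome
events and the Deleting state (PG.h/PG.cc), the PGDelete queue item
(OpQueueItem.h/.cc), and queue_for_pg_delete()/dequeue_delete() (OSD.h/.cc).
As a rough guide to how they compose, here is a minimal, self-contained toy
-- not Ceph code -- of the batch-and-requeue pattern that results:

  // Toy model of the new deletion flow: remove a bounded batch of objects
  // per pass, then requeue the work item so other queued work interleaves.
  #include <algorithm>
  #include <deque>
  #include <functional>
  #include <iostream>
  #include <vector>

  int main() {
    std::vector<int> objects(10);             // stand-in for a PG's objects
    std::deque<std::function<void()>> op_wq;  // stand-in for the OSD op_wq

    std::function<void()> delete_some = [&] {
      // cf. osd_target_transaction_size bounding each transaction
      size_t batch = std::min<size_t>(3, objects.size());
      objects.resize(objects.size() - batch);
      std::cout << "deleted " << batch << ", " << objects.size() << " left\n";
      if (!objects.empty())
        op_wq.push_back(delete_some);         // cf. queue_for_pg_delete()
      else
        std::cout << "collection removed; PG unregistered\n";
    };

    op_wq.push_back(delete_some);             // cf. DeleteStart -> Deleting
    while (!op_wq.empty()) {                  // worker draining the op_wq
      auto work = op_wq.front();
      op_wq.pop_front();
      work();                                 // cf. dequeue_delete() -> DeleteSome
    }
  }

Because each batch is a separate op_wq item, deletion competes with client
I/O in the same queue and can be shaped by the new
osd_op_queue_mclock_pg_delete_* options added below.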
src/common/legacy_config_opts.h
src/common/options.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/OpQueueItem.cc
src/osd/OpQueueItem.h
src/osd/PG.cc
src/osd/PG.h
src/osd/PrimaryLogPG.cc
src/osd/mClockOpClassSupport.cc
src/osd/mClockOpClassSupport.h

diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
index f2b8196ef8e2d98fc0f518bcea407ddd4b800945..231a24396196ab0d411e87bff6fba97ced25d7f5 100644
@@ -653,6 +653,9 @@ OPTION(osd_op_queue_mclock_recov_lim, OPT_DOUBLE)
 OPTION(osd_op_queue_mclock_scrub_res, OPT_DOUBLE)
 OPTION(osd_op_queue_mclock_scrub_wgt, OPT_DOUBLE)
 OPTION(osd_op_queue_mclock_scrub_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_pg_delete_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_pg_delete_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_pg_delete_lim, OPT_DOUBLE)
 OPTION(osd_op_queue_mclock_peering_event_res, OPT_DOUBLE)
 OPTION(osd_op_queue_mclock_peering_event_wgt, OPT_DOUBLE)
 OPTION(osd_op_queue_mclock_peering_event_lim, OPT_DOUBLE)
@@ -870,6 +873,9 @@ OPTION(osd_scrub_cost, OPT_U32)
 // requested scrubs jump the queue of scheduled scrubs
 OPTION(osd_requested_scrub_priority, OPT_U32)
 
+OPTION(osd_pg_delete_priority, OPT_U32)
+OPTION(osd_pg_delete_cost, OPT_U32) // set default cost equal to 1MB io
+
 OPTION(osd_recovery_priority, OPT_U32)
 // set default cost equal to 20MB io
 OPTION(osd_recovery_cost, OPT_U32)
diff --git a/src/common/options.cc b/src/common/options.cc
index e7d767bb9eaaeb06276a5dfcf8cc493e29dc6ffc..12b7e18ff1b8c75adb1bc3cf8f81b45c39c6bb6e 100644
@@ -2357,6 +2357,66 @@ std::vector<Option> get_global_options() {
     .add_see_also("osd_op_queue_mclock_scrub_wgt")
     .add_see_also("osd_op_queue_mclock_scrub_lim"),
 
+    Option("osd_op_queue_mclock_pg_delete_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock reservation of pg delete work")
+    .set_long_description("mclock reservation of pg delete work when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the reservation")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim"),
+
+    Option("osd_op_queue_mclock_pg_delete_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .set_description("mclock weight of pg delete work")
+    .set_long_description("mclock weight of pg delete work when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the weight")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_lim"),
+
+    Option("osd_op_queue_mclock_pg_delete_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.001)
+    .set_description("mclock limit of pg delete work")
+    .set_long_description("mclock limit of pg delete work when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the limit")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt"),
+
     Option("osd_op_queue_mclock_peering_event_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(0.0)
     .set_description("mclock reservation of peering events")
@@ -3128,6 +3188,14 @@ std::vector<Option> get_global_options() {
     .set_default(1<<20)
     .set_description(""),
 
+    Option("osd_pg_delete_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description(""),
+
+    Option("osd_pg_delete_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1<<20)
+    .set_description(""),
+
     Option("osd_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(5)
     .set_description(""),
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index b851b63dd61fb0fb8d99279d574975268d72e943..a732f9ee5dfe24eac08321ddc96e5d44e86cbbf8 100644
@@ -1747,6 +1747,26 @@ void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
       epoch));
 }
 
+void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
+{
+  dout(10) << __func__ << " on " << pgid << " e " << e  << dendl;
+  enqueue_back(
+    OpQueueItem(
+      unique_ptr<OpQueueItem::OpQueueable>(
+       new PGDelete(pgid, e)),
+      cct->_conf->osd_pg_delete_cost,
+      cct->_conf->osd_pg_delete_priority,
+      ceph_clock_now(),
+      0,
+      e));
+}
+
+void OSDService::finish_pg_delete(PG *pg)
+{
+  osd->op_shardedwq.clear_pg_pointer(pg);
+  pg_remove_epoch(pg->get_pgid());
+}
+
 void OSDService::_queue_for_recovery(
   std::pair<epoch_t, PGRef> p,
   uint64_t reserved_pushes)
@@ -2218,7 +2238,9 @@ will start to track new ops received afterwards.";
       for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
           it != pg_map.end();
           ++it) {
-
+       if (it->second->is_deleted()) {
+         continue;
+       }
         list<obj_watch_item_t> pg_watchers;
         PG *pg = it->second;
         pg->get_watchers(&pg_watchers);
@@ -3441,6 +3463,9 @@ int OSD::shutdown()
     for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
         p != pg_map.end();
         ++p) {
+      if (p->second->is_deleted()) {
+       continue;
+      }
       dout(20) << " kicking pg " << p->first << dendl;
       p->second->lock();
       if (p->second->get_num_ref() != 1) {
@@ -3809,8 +3834,6 @@ void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
     }
     peering_wait_for_split.erase(to_wake);
   }
-  if (!service.get_osdmap()->have_pg_pool(pg->pg_id.pool()))
-    _remove_pg(pg);
 }
 
 OSD::res_result OSD::_try_resurrect_pg(
@@ -3907,14 +3930,38 @@ PG *OSD::_create_lock_pg(
 
 PG *OSD::_lookup_lock_pg(spg_t pgid)
 {
-  RWLock::RLocker l(pg_map_lock);
-
-  auto pg_map_entry = pg_map.find(pgid);
-  if (pg_map_entry == pg_map.end())
+  while (true) {
+    {
+      RWLock::RLocker l(pg_map_lock);
+      auto p = pg_map.find(pgid);
+      if (p == pg_map.end()) {
+       return nullptr;
+      }
+      PG *pg = p->second;
+      pg->lock();
+      if (!pg->is_deleted()) {
+       return pg;
+      }
+      pg->unlock();
+    }
+    // try again, this time with a write lock
+    {
+      RWLock::WLocker l(pg_map_lock);
+      auto p = pg_map.find(pgid);
+      if (p == pg_map.end()) {
+       return nullptr;
+      }
+      PG *pg = p->second;
+      pg->lock();
+      if (!pg->is_deleted()) {
+       return pg;
+      }
+      pg_map.erase(p);
+      pg->put("PGMap");
+      pg->unlock();
+    }
     return nullptr;
-  PG *pg = pg_map_entry->second;
-  pg->lock();
-  return pg;
+  }
 }
 
 PG *OSD::lookup_lock_pg(spg_t pgid)
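
The two-pass shape of _lookup_lock_pg() above follows from the lock modes:
the fast path under the shared pg_map_lock can find and return a live PG,
but only the exclusive lock may erase the stale pg_map entry that a fully
deleted PG leaves behind.  A compact sketch of that pattern with toy types
(hypothetical names; the per-PG lock taken in the real code is omitted):

  #include <map>
  #include <memory>
  #include <mutex>
  #include <shared_mutex>

  struct Entry { bool deleted = false; };

  std::shared_mutex map_lock;                     // cf. pg_map_lock
  std::map<int, std::shared_ptr<Entry>> entries;  // cf. pg_map

  std::shared_ptr<Entry> lookup(int id) {
    {
      std::shared_lock<std::shared_mutex> rl(map_lock);  // fast path
      auto p = entries.find(id);
      if (p == entries.end())
        return nullptr;
      if (!p->second->deleted)
        return p->second;
    }
    std::unique_lock<std::shared_mutex> wl(map_lock);    // slow path
    auto p = entries.find(id);
    if (p == entries.end())
      return nullptr;
    if (!p->second->deleted)
      return p->second;   // entry was replaced while we retook the lock
    entries.erase(p);     // exclusive lock lets us unlink the stale entry
    return nullptr;
  }

  int main() {
    entries[1] = std::make_shared<Entry>();
    entries[1]->deleted = true;
    lookup(1);                 // returns nullptr and erases the stale entry
    return entries.count(1);   // 0: the deleted entry is gone from the map
  }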
@@ -4584,6 +4631,9 @@ void OSD::maybe_update_heartbeat_peers()
     for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
         i != pg_map.end();
         ++i) {
+      if (i->second->is_deleted()) {
+       continue;
+      }
       PG *pg = i->second;
       pg->with_heartbeat_peers([&](int peer) {
          if (osdmap->is_up(peer)) {
@@ -6381,6 +6431,9 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
     RWLock::RLocker l(pg_map_lock);
     for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
         pg_map_e != pg_map.end(); ++pg_map_e) {
+      if (pg_map_e->second->is_deleted()) {
+       continue;
+      }
       PG *pg = pg_map_e->second;
       string s = stringify(pg->pg_id);
       f->open_array_section(s.c_str());
@@ -6908,8 +6961,12 @@ void OSD::handle_scrub(MOSDScrub *m)
   if (m->scrub_pgs.empty()) {
     for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
         p != pg_map.end();
-        ++p)
+        ++p) {
+      if (p->second->is_deleted()) {
+       continue;
+      }
       handle_pg_scrub(m, p->second);
+    }
   } else {
     for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
         p != m->scrub_pgs.end();
@@ -7873,12 +7930,10 @@ void OSD::advance_pg(
   set<PGRef> *new_pgs)
 {
   assert(pg->is_locked());
-  epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
   OSDMapRef lastmap = pg->get_osdmap();
-
   assert(lastmap->get_epoch() < osd_epoch);
 
-  for (;
+  for (epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
        next_epoch <= osd_epoch;
        ++next_epoch) {
     OSDMapRef nextmap = service.try_get_map(next_epoch);
@@ -7933,7 +7988,6 @@ void OSD::consume_map()
   }
 
   int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
-  list<PGRef> to_remove;
 
   // scan pg's
   {
@@ -7942,6 +7996,9 @@ void OSD::consume_map()
         it != pg_map.end();
         ++it) {
       PG *pg = it->second;
+      if (pg->is_deleted()) {
+       continue;
+      }
       pg->lock();
       if (pg->is_primary())
         num_pg_primary++;
@@ -7950,12 +8007,7 @@ void OSD::consume_map()
       else
         num_pg_stray++;
 
-      if (!osdmap->have_pg_pool(pg->pg_id.pool())) {
-        //pool is deleted!
-        to_remove.push_back(PGRef(pg));
-      } else {
-        service.init_splits_between(it->first, service.get_osdmap(), osdmap);
-      }
+      service.init_splits_between(it->first, service.get_osdmap(), osdmap);
 
       pg->unlock();
     }
@@ -7971,15 +8023,6 @@ void OSD::consume_map()
     }
   }
 
-  for (list<PGRef>::iterator i = to_remove.begin();
-       i != to_remove.end();
-       to_remove.erase(i++)) {
-    RWLock::WLocker locker(pg_map_lock);
-    (*i)->lock();
-    _remove_pg(&**i);
-    (*i)->unlock();
-  }
-
   service.expand_pg_num(service.get_osdmap(), osdmap);
 
   service.pre_publish_map(osdmap);
@@ -8005,6 +8048,9 @@ void OSD::consume_map()
         it != pg_map.end();
         ++it) {
       PG *pg = it->second;
+      if (pg->is_deleted()) {
+       continue;
+      }
       pg->lock();
       pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
       pg->unlock();
@@ -8936,32 +8982,16 @@ void OSD::handle_pg_remove(OpRequestRef op)
       continue;
     }
 
-    RWLock::WLocker l(pg_map_lock);
-    if (pg_map.count(pgid) == 0) {
-      dout(10) << " don't have pg " << pgid << dendl;
-      continue;
-    }
-    dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
-    PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
-    pg_history_t history = pg->get_history();
-    int up_primary, acting_primary;
-    vector<int> up, acting;
-    osdmap->pg_to_up_acting_osds(
-      pgid.pgid, &up, &up_primary, &acting, &acting_primary);
-    bool valid_history = project_pg_history(
-      pg->pg_id, history, pg->get_osdmap()->get_epoch(),
-      up, up_primary, acting, acting_primary);
-    if (valid_history &&
-        history.same_interval_since <= m->get_epoch()) {
-      assert(pg->get_primary().osd == m->get_source().num());
-      PGRef _pg(pg);
-      _remove_pg(pg);
+    PG *pg = _lookup_lock_pg(pgid);
+    if (pg) {
+      pg->queue_peering_event(
+       PGPeeringEventRef(
+         new PGPeeringEvent(
+           m->get_epoch(), m->get_epoch(),
+           PG::DeleteStart())));
       pg->unlock();
     } else {
-      dout(10) << *pg << " ignoring remove request, pg changed in epoch "
-              << history.same_interval_since
-              << " > " << m->get_epoch() << dendl;
-      pg->unlock();
+      dout(10) << " don't have pg " << pgid << dendl;
     }
   }
 }
@@ -8994,7 +9024,7 @@ void OSD::_remove_pg(PG *pg)
   service.pg_remove_epoch(pg->pg_id);
 
   // dereference from op_wq
-  op_shardedwq.clear_pg_pointer(pg->pg_id);
+  //op_shardedwq.clear_pg_pointer(pg->pg_id);
 
   // remove from map
   pg_map.erase(pg->pg_id);
@@ -9335,10 +9365,6 @@ void OSD::dequeue_peering_evt(
   PGPeeringEventRef evt,
   ThreadPool::TPHandle& handle)
 {
-  if (pg->is_deleting()) {
-    pg->unlock();
-    return;
-  }
   auto curmap = service.get_osdmap();
   PG::RecoveryCtx rctx = create_context();
   set<PGRef> split_pgs;
@@ -9351,10 +9377,24 @@ void OSD::dequeue_peering_evt(
   if (!split_pgs.empty()) {
     rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
     split_pgs.clear();
- }
+  }
   dispatch_context_transaction(rctx, pg, &handle);
+  bool deleted = pg->is_deleted();
   pg->unlock();
 
+  if (deleted) {
+    RWLock::WLocker l(pg_map_lock);
+    auto p = pg_map.find(pg->get_pgid());
+    if (p != pg_map.end() &&
+       p->second == pg) {
+      dout(20) << __func__ << " removed pg " << pg << " from pg_map" << dendl;
+      pg_map.erase(p);
+      pg->put("PGMap");
+    } else {
+      dout(20) << __func__ << " failed to remove pg " << pg << " from pg_map" << dendl;
+    }
+  }
+
   if (need_up_thru) {
     queue_want_up_thru(same_interval_since);
   }
@@ -9363,6 +9403,22 @@ void OSD::dequeue_peering_evt(
   service.send_pg_temp();
 }
 
+void OSD::dequeue_delete(
+  PG *pg,
+  epoch_t e,
+  ThreadPool::TPHandle& handle)
+{
+  dequeue_peering_evt(
+    pg,
+    PGPeeringEventRef(
+      new PGPeeringEvent(
+       e, e,
+       PG::DeleteSome())),
+    handle);
+}
+
+
+
 // --------------------------------
 
 const char** OSD::get_tracked_conf_keys() const
@@ -9833,16 +9889,17 @@ void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
   }
 }
 
-void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
+void OSD::ShardedOpWQ::clear_pg_pointer(PG *pg)
 {
+  spg_t pgid = pg->get_pgid();
   uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
   auto sdata = shard_list[shard_index];
   Mutex::Locker l(sdata->sdata_op_ordering_lock);
   auto p = sdata->pg_slots.find(pgid);
   if (p != sdata->pg_slots.end()) {
     auto& slot = p->second;
-    dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
-    assert(!slot.pg || slot.pg->is_deleting());
+    assert(!slot.pg || slot.pg == pg);
+    dout(20) << __func__ << " " << pgid << " pg " << pg << " cleared" << dendl;
     slot.pg = nullptr;
   }
 }
@@ -9922,10 +9979,19 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
   osd->service.maybe_inject_dispatch_delay();
 
   // [lookup +] lock pg (if we have it)
-  if (!pg) {
-    pg = osd->_lookup_lock_pg(token);
-  } else {
-    pg->lock();
+  while (true) {
+    if (!pg) {
+      pg = osd->_lookup_lock_pg(token);
+    } else {
+      pg->lock();
+      if (pg->is_deleted()) {
+       dout(20) << __func__ << " got deleted pg " << pg << ", retrying" << dendl;
+       pg->unlock();
+       pg = nullptr;
+       continue;
+      }
+    }
+    break;
   }
 
   osd->service.maybe_inject_dispatch_delay();
@@ -9960,7 +10026,7 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
     sdata->sdata_op_ordering_lock.Unlock();
     return;
   }
-  if (pg && !slot.pg && !pg->is_deleting()) {
+  if (pg && !slot.pg) {
     dout(20) << __func__ << " " << token << " set pg to " << pg << dendl;
     slot.pg = pg;
   }
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 07eb73a81a0ce2af614d58c8915ddfd8bdfeef1d..f1e7cb3d3647cd480caa0f73882244140ad40cea 100644
@@ -883,6 +883,8 @@ public:
   AsyncReserver<spg_t> snap_reserver;
   void queue_for_snap_trim(PG *pg);
   void queue_for_scrub(PG *pg, bool with_high_priority);
+  void queue_for_pg_delete(spg_t pgid, epoch_t e);
+  void finish_pg_delete(PG *pg);
 
 private:
   // -- pg recovery and associated throttling --
@@ -1626,6 +1628,7 @@ private:
   friend class PGOpItem;
   friend class PGPeeringItem;
   friend class PGRecovery;
+  friend class PGDelete;
 
   class ShardedOpWQ
     : public ShardedThreadPool::ShardedWQ<OpQueueItem>
@@ -1737,7 +1740,7 @@ private:
     void prune_pg_waiters(OSDMapRef osdmap, int whoami);
 
     /// clear cached PGRef on pg deletion
-    void clear_pg_pointer(spg_t pgid);
+    void clear_pg_pointer(PG *pg);
 
     /// clear pg_slots on shutdown
     void clear_pg_slots();
@@ -1842,6 +1845,11 @@ private:
     PGPeeringEventRef ref,
     ThreadPool::TPHandle& handle);
 
+  void dequeue_delete(
+    PG *pg,
+    epoch_t epoch,
+    ThreadPool::TPHandle& handle);
+
   friend class PG;
   friend class PrimaryLogPG;
 
diff --git a/src/osd/OpQueueItem.cc b/src/osd/OpQueueItem.cc
index fc7011c13284d5e9738da34fed19aa2f35b5b989..5e2bda20b8709a5cca4a99979a97ad0f9e325dde 100644
@@ -55,4 +55,11 @@ void PGRecovery::run(OSD *osd,
   pg->unlock();
 }
 
+void PGDelete::run(
+  OSD *osd,
+  PGRef& pg,
+  ThreadPool::TPHandle &handle)
+{
+  osd->dequeue_delete(pg.get(), epoch_queued, handle);
+}
 
diff --git a/src/osd/OpQueueItem.h b/src/osd/OpQueueItem.h
index b34414b1211552736afce745a6439b0e939e59bc..ac7401334e420cd89c26993933ba6a9725b37b80 100644
@@ -44,7 +44,8 @@ public:
       peering_event,
       bg_snaptrim,
       bg_recovery,
-      bg_scrub
+      bg_scrub,
+      bg_pg_delete
     };
     using Ref = std::unique_ptr<OpQueueable>;
 
@@ -71,6 +72,7 @@ public:
     friend ostream& operator<<(ostream& out, const OpQueueable& q) {
       return q.print(out);
     }
+
   };
 
 private:
@@ -276,3 +278,23 @@ public:
   virtual void run(
     OSD *osd, PGRef& pg, ThreadPool::TPHandle &handle) override final;
 };
+
+class PGDelete : public PGOpQueueable {
+  epoch_t epoch_queued;
+public:
+  PGDelete(
+    spg_t pg,
+    epoch_t epoch_queued)
+    : PGOpQueueable(pg),
+      epoch_queued(epoch_queued) {}
+  op_type_t get_op_type() const override final {
+    return op_type_t::bg_pg_delete;
+  }
+  ostream &print(ostream &rhs) const override final {
+    return rhs << "PGDelete(" << get_pgid()
+              << " e" << epoch_queued
+              << ")";
+  }
+  void run(
+    OSD *osd, PGRef& pg, ThreadPool::TPHandle &handle) override final;
+};
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 5154ccae799195c27044e2617e5e339c7e2d9ea8..0cac8a686bcf4b5b0cda3a645327069411da7e01 100644
@@ -5608,6 +5608,9 @@ ostream& operator<<(ostream& out, const PG& pg)
   out << " r=" << pg.get_role();
   out << " lpr=" << pg.get_last_peering_reset();
 
+  if (pg.deleting)
+    out << " DELETING";
+
   if (!pg.past_intervals.empty()) {
     out << " pi=[" << pg.past_intervals.get_bounds()
        << ")/" << pg.past_intervals.size();
@@ -6034,6 +6037,87 @@ void PG::update_store_on_load()
   }
 }
 
+
+void PG::_delete_some()
+{
+  dout(10) << __func__ << dendl;
+
+  // this ensures we get a valid result.  it *also* serves to throttle
+  // us a bit (on filestore) because we won't delete more until the
+  // previous deletions are applied.
+  osr->flush();
+
+  vector<ghobject_t> olist;
+  ObjectStore::Transaction t;
+  int max = std::min(osd->store->get_ideal_list_max(),
+                    (int)cct->_conf->osd_target_transaction_size);
+  ghobject_t next;
+  osd->store->collection_list(
+    coll,
+    next,
+    ghobject_t::get_max(),
+    max,
+    &olist,
+    &next);
+  dout(20) << __func__ << " " << olist << dendl;
+
+  OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
+  int64_t num = 0;
+  for (auto& oid : olist) {
+    if (oid.is_pgmeta()) {
+      continue;
+    }
+    int r = snap_mapper.remove_oid(oid.hobj, &_t);
+    if (r != 0 && r != -ENOENT) {
+      ceph_abort();
+    }
+    t.remove(coll, oid);
+    ++num;
+  }
+  epoch_t e = get_osdmap()->get_epoch();
+  if (num) {
+    dout(20) << __func__ << " deleting " << num << " objects" << dendl;
+    struct C_DeleteMore : public Context {
+      PGRef pg;
+      epoch_t epoch;
+      C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
+      void finish(int r) {
+       if (r >= 0) {
+         pg->lock();
+         if (!pg->pg_has_reset_since(epoch)) {
+           pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
+         }
+         pg->unlock();
+       }
+      }
+    };
+    osd->store->queue_transaction(
+      osr.get(),
+      std::move(t),
+      new C_DeleteMore(this, e));
+  } else {
+    dout(20) << __func__ << " finished" << dendl;
+    if (cct->_conf->osd_inject_failure_on_pg_removal) {
+      _exit(1);
+    }
+
+    ObjectStore::Transaction t;
+    PGLog::clear_info_log(info.pgid, &t);
+    t.remove_collection(coll);
+    int r = osd->store->queue_transaction(
+      osd->meta_osr.get(), std::move(t),
+      // keep pg ref around until txn completes to avoid any issues
+      // with Sequencer lifecycle (seen w/ filestore).
+      new ContainerContext<PGRef>(this));
+    assert(r == 0);
+
+    osd->finish_pg_delete(this);
+    deleted = true;
+  }
+}
+
+
+
 /*------------ Recovery State Machine----------------*/
 #undef dout_prefix
 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
@@ -7760,10 +7844,16 @@ PG::RecoveryState::Stray::Stray(my_context ctx)
   assert(!pg->is_peered());
   assert(!pg->is_peering());
   assert(!pg->is_primary());
-  pg->start_flush(
-    context< RecoveryMachine >().get_cur_transaction(),
-    context< RecoveryMachine >().get_on_applied_context_list(),
-    context< RecoveryMachine >().get_on_safe_context_list());
+
+  if (!pg->get_osdmap()->have_pg_pool(pg->get_pgid().pool())) {
+    ldout(pg->cct,10) << __func__ << " pool is deleted" << dendl;
+    post_event(DeleteStart());
+  } else {
+    pg->start_flush(
+      context< RecoveryMachine >().get_cur_transaction(),
+      context< RecoveryMachine >().get_on_applied_context_list(),
+      context< RecoveryMachine >().get_on_safe_context_list());
+  }
 }
 
 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
@@ -7861,6 +7951,36 @@ void PG::RecoveryState::Stray::exit()
   pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
 }
 
+
+/*--------Deleting----------*/
+PG::RecoveryState::Deleting::Deleting(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< RecoveryMachine >().pg, "Started/Deleting")
+{
+  context< RecoveryMachine >().log_enter(state_name);
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->deleting = true;
+  ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
+  pg->on_removal(t);
+  pg->osd->logger->inc(l_osd_pg_removing);
+  pg->osd->queue_for_pg_delete(pg->get_pgid(), pg->get_osdmap()->get_epoch());
+}
+
+boost::statechart::result PG::RecoveryState::Deleting::react(const DeleteSome& evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->_delete_some();
+  return discard_event();
+}
+
+void PG::RecoveryState::Deleting::exit()
+{
+  context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->deleting = false;
+  pg->osd->logger->dec(l_osd_pg_removing);
+}
+
 /*--------GetInfo---------*/
 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
   : my_base(ctx),
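
Note the guard in C_DeleteMore above: the next batch is queued only if the
PG has not been reset since the epoch captured when the transaction was
queued, so a PG that is re-created or split into a new interval drops its
stale deletion continuation (pg_has_reset_since() is also re-keyed off the
new 'deleted' flag in PG.h below).  A toy illustration of the guard
(hypothetical names, simplified):

  #include <iostream>

  struct ToyPG {
    unsigned last_peering_reset = 0;
    bool deleted = false;
    // cf. PG::pg_has_reset_since()
    bool has_reset_since(unsigned e) const {
      return deleted || e < last_peering_reset;
    }
  };

  int main() {
    ToyPG pg;
    unsigned e = 10;             // epoch captured when the txn was queued
    pg.last_peering_reset = 12;  // a new interval began before it committed
    if (!pg.has_reset_since(e))
      std::cout << "queue next delete batch\n";
    else
      std::cout << "stale delete continuation dropped\n";  // taken here
  }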
diff --git a/src/osd/PG.h b/src/osd/PG.h
index a76bc54fdc7dbba86d6864081a1cfc7785889d60..c19363323d9038ac60596bbe1984c77a04a98780 100644
@@ -319,6 +319,9 @@ public:
   bool is_deleting() const {
     return deleting;
   }
+  bool is_deleted() const {
+    return deleted;
+  }
   bool is_replica() const {
     return role > 0;
   }
@@ -327,7 +330,7 @@ public:
   }
   bool pg_has_reset_since(epoch_t e) {
     assert(is_locked());
-    return deleting || e < get_last_peering_reset();
+    return deleted || e < get_last_peering_reset();
   }
 
   bool is_ec_pg() const {
@@ -476,6 +479,7 @@ public:
 
   virtual void on_removal(ObjectStore::Transaction *t) = 0;
 
+  void _delete_some();
   void pg_remove_object(const ghobject_t& oid, ObjectStore::Transaction *t);
 
   // reference counting
@@ -570,6 +574,7 @@ protected:
 
 
   bool deleting;  // true while in removing or OSD is shutting down
+  bool deleted = false;
 
   ZTracer::Endpoint trace_endpoint;
 
@@ -1863,6 +1868,9 @@ protected:
 
   TrivialEvent(IntervalFlush)
 
+  TrivialEvent(DeleteStart)
+  TrivialEvent(DeleteSome)
+
   /* Encapsulates PG recovery process */
   class RecoveryState {
     void start_handle(RecoveryCtx *new_ctx);
@@ -1973,6 +1981,8 @@ protected:
     //       RepWaitBackfillReserved
     //       RepWaitRecoveryReserved
     //     Stray
+    //     Deleting
+    // Crashed
 
     struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
       explicit Crashed(my_context ctx);
@@ -2270,6 +2280,7 @@ protected:
       void exit();
     };
 
+    struct Deleting;
     struct RepNotRecovering;
     struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
       explicit ReplicaActive(my_context ctx);
@@ -2287,7 +2298,8 @@ protected:
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
        boost::statechart::custom_reaction< RemoteBackfillPreempted >,
-       boost::statechart::custom_reaction< RemoteRecoveryPreempted >
+       boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+       boost::statechart::transition<DeleteStart, Deleting>
        > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const MInfoRec& infoevt);
@@ -2438,9 +2450,8 @@ protected:
       void exit();
     };
 
-    struct Stray : boost::statechart::state< Stray, Started >, NamedState {
-      map<int, pair<pg_query_t, epoch_t> > pending_queries;
-
+    struct Stray : boost::statechart::state< Stray, Started >,
+             NamedState {
       explicit Stray(my_context ctx);
       void exit();
 
@@ -2449,7 +2460,8 @@ protected:
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< ActMap >,
-       boost::statechart::custom_reaction< RecoveryDone >
+       boost::statechart::custom_reaction< RecoveryDone >,
+       boost::statechart::transition<DeleteStart, Deleting>
        > reactions;
       boost::statechart::result react(const MQuery& query);
       boost::statechart::result react(const MLogRec& logevt);
@@ -2460,6 +2472,19 @@ protected:
       }
     };
 
+    struct Deleting : boost::statechart::state<Deleting, Started>, NamedState {
+      typedef boost::mpl::list <
+       boost::statechart::custom_reaction< ActMap >,
+       boost::statechart::custom_reaction< DeleteSome >
+       > reactions;
+      explicit Deleting(my_context ctx);
+      boost::statechart::result react(const ActMap &evt) {
+       return discard_event();
+      }
+      boost::statechart::result react(const DeleteSome &evt);
+      void exit();
+    };
+
     struct GetLog;
 
     struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
@@ -2556,7 +2581,6 @@ protected:
       void exit();
     };
 
-
     RecoveryMachine machine;
     PG *pg;
 
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index a1f2c054537e50a5490c382b5ba6b4f4f9e8082c..fa2cae7666f901e33cb4010768da1f2e688512b9 100644
@@ -11503,15 +11503,13 @@ void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
   pg_log.reset_backfill();
   dirty_info = true;
 
-
   // clear log
   PGLogEntryHandler rollbacker{this, t};
   pg_log.roll_forward(&rollbacker);
 
-  write_if_dirty(*t);
-
-  if (!deleting)
+  if (!deleting) {
     on_shutdown();
+  }
 }
 
 void PrimaryLogPG::clear_async_reads()
diff --git a/src/osd/mClockOpClassSupport.cc b/src/osd/mClockOpClassSupport.cc
index 3849ad9c680350c2f49ba758f2db2ec35f5afb08..5ff4fd76d113fe3b268b89382fcfb43e77c09130 100644
@@ -39,6 +39,9 @@ namespace ceph {
       scrub(cct->_conf->osd_op_queue_mclock_scrub_res,
            cct->_conf->osd_op_queue_mclock_scrub_wgt,
            cct->_conf->osd_op_queue_mclock_scrub_lim),
+      pg_delete(cct->_conf->osd_op_queue_mclock_pg_delete_res,
+           cct->_conf->osd_op_queue_mclock_pg_delete_wgt,
+           cct->_conf->osd_op_queue_mclock_pg_delete_lim),
       peering_event(cct->_conf->osd_op_queue_mclock_peering_event_res,
                    cct->_conf->osd_op_queue_mclock_peering_event_wgt,
                    cct->_conf->osd_op_queue_mclock_peering_event_lim)
diff --git a/src/osd/mClockOpClassSupport.h b/src/osd/mClockOpClassSupport.h
index a33e57a5d6168cea4158f936d59dd9302d980902..1ea1043eb08457ab7b79ec7c0c252a09cbd81bbf 100644
@@ -28,7 +28,8 @@ namespace ceph {
     using op_item_type_t = OpQueueItem::OpQueueable::op_type_t;
     
     enum class osd_op_type_t {
-      client_op, osd_rep_op, bg_snaptrim, bg_recovery, bg_scrub, peering_event
+      client_op, osd_rep_op, bg_snaptrim, bg_recovery, bg_scrub, bg_pg_delete,
+      peering_event
     };
 
     class OpClassClientInfoMgr {
@@ -37,6 +38,7 @@ namespace ceph {
       crimson::dmclock::ClientInfo snaptrim;
       crimson::dmclock::ClientInfo recov;
       crimson::dmclock::ClientInfo scrub;
+      crimson::dmclock::ClientInfo pg_delete;
       crimson::dmclock::ClientInfo peering_event;
 
       static constexpr std::size_t rep_op_msg_bitset_size = 128;
@@ -60,6 +62,8 @@ namespace ceph {
          return &recov;
        case osd_op_type_t::bg_scrub:
          return &scrub;
+       case osd_op_type_t::bg_pg_delete:
+         return &pg_delete;
        case osd_op_type_t::peering_event:
          return &peering_event;
        default:
@@ -80,6 +84,8 @@ namespace ceph {
          return osd_op_type_t::bg_recovery;
        case op_item_type_t::bg_scrub:
          return osd_op_type_t::bg_scrub;
+       case op_item_type_t::bg_pg_delete:
+         return osd_op_type_t::bg_pg_delete;
        case op_item_type_t::peering_event:
          return osd_op_type_t::peering_event;
        default: