]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/OSDMonitor: limit number of concurrently creating pgs
authorSage Weil <sage@redhat.com>
Thu, 18 May 2017 19:41:39 +0000 (15:41 -0400)
committerSage Weil <sage@redhat.com>
Fri, 2 Jun 2017 17:02:44 +0000 (13:02 -0400)
There is overhead for PGs we are creating because the mon has to track
which OSD each one current maps to.  This can be problematic on a very
large cluster.  Limit the overhead by setting a cap on the number of PGs
we are creating at once; leave the rest in a persistent queue.

Signed-off-by: Sage Weil <sage@redhat.com>
src/common/config_opts.h
src/mon/CreatingPGs.h
src/mon/OSDMonitor.cc

index 7bfe8c0864af4d8a8f6077aea907d245c61f9dbf..85552efba63835f40be98d4cafcb432469552da8 100644 (file)
@@ -271,6 +271,7 @@ OPTION(mon_osd_cache_size, OPT_INT, 10)  // the size of osdmaps cache, not to re
 
 OPTION(mon_cpu_threads, OPT_INT, 4)
 OPTION(mon_osd_mapping_pgs_per_chunk, OPT_INT, 4096)
+OPTION(mon_osd_max_creating_pgs, OPT_INT, 512)
 OPTION(mon_tick_interval, OPT_INT, 5)
 OPTION(mon_session_timeout, OPT_INT, 300)    // must send keepalive or subscribe
 OPTION(mon_subscribe_interval, OPT_DOUBLE, 24*3600)  // for legacy clients only
index 46aaa7040772a04f1cb7c1d441627efe976e1000..f2d8ed2869747460c3b7f9c2a3cc6465f4c6a8ae 100644 (file)
@@ -9,25 +9,48 @@
 
 struct creating_pgs_t {
   epoch_t last_scan_epoch = 0;
+
+  /// pgs we are currently creating
   std::map<pg_t, std::pair<epoch_t, utime_t> > pgs;
-  std::set<int64_t> created_pools;
 
-  unsigned create_pool(int64_t poolid, uint32_t pg_num,
-                      epoch_t created, utime_t modified) {
-    if (created_pools.count(poolid)) {
-      return 0;
+  struct create_info {
+    epoch_t created;
+    utime_t modified;
+    uint64_t start = 0;
+    uint64_t end = 0;
+    bool done() const {
+      return start >= end;
     }
-    const unsigned total = pgs.size();
-    for (ps_t ps = 0; ps < pg_num; ps++) {
-      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
-      if (pgs.count(pgid)) {
-       continue;
-      }
-      pgs.emplace(pgid, make_pair(created, modified));
+    void encode(bufferlist& bl) const {
+      ::encode(created, bl);
+      ::encode(modified, bl);
+      ::encode(start, bl);
+      ::encode(end, bl);
     }
-    return pgs.size() - total;
-  }
+    void decode(bufferlist::iterator& p) {
+      ::decode(created, p);
+      ::decode(modified, p);
+      ::decode(start, p);
+      ::decode(end, p);
+    }
+  };
+
+  /// queue of pgs we still need to create (poolid -> <created, set of ps>)
+  map<int64_t,create_info> queue;
 
+  /// pools that exist in the osdmap for which at least one pg has been created
+  std::set<int64_t> created_pools;
+
+  void create_pool(int64_t poolid, uint32_t pg_num,
+                  epoch_t created, utime_t modified) {
+    if (created_pools.count(poolid) == 0) {
+      auto& c = queue[poolid];
+      c.created = created;
+      c.modified = modified;
+      c.end = pg_num;
+      created_pools.insert(poolid);
+    }
+  }
   unsigned remove_pool(int64_t removed_pool) {
     const unsigned total = pgs.size();
     auto first = pgs.lower_bound(pg_t{0, (uint64_t)removed_pool});
@@ -37,22 +60,25 @@ struct creating_pgs_t {
     return total - pgs.size();
   }
   void encode(bufferlist& bl) const {
-    ENCODE_START(1, 1, bl);
+    ENCODE_START(2, 1, bl);
     ::encode(last_scan_epoch, bl);
     ::encode(pgs, bl);
     ::encode(created_pools, bl);
+    ::encode(queue, bl);
     ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
-    DECODE_START(1, bl);
+    DECODE_START(2, bl);
     ::decode(last_scan_epoch, bl);
     ::decode(pgs, bl);
     ::decode(created_pools, bl);
+    if (struct_v >= 2)
+      ::decode(queue, bl);
     DECODE_FINISH(bl);
   }
   void dump(ceph::Formatter *f) const {
-    f->open_object_section("creating_pgs");
     f->dump_unsigned("last_scan_epoch", last_scan_epoch);
+    f->open_array_section("creating_pgs");
     for (auto& pg : pgs) {
       f->open_object_section("pg");
       f->dump_stream("pgid") << pg.first;
@@ -61,6 +87,17 @@ struct creating_pgs_t {
       f->close_section();
     }
     f->close_section();
+    f->open_array_section("queue");
+    for (auto& p : queue) {
+      f->open_object_section("pool");
+      f->dump_unsigned("pool", p.first);
+      f->dump_unsigned("created", p.second.created);
+      f->dump_stream("modified") << p.second.modified;
+      f->dump_unsigned("ps_start", p.second.start);
+      f->dump_unsigned("ps_end", p.second.end);
+      f->close_section();
+    }
+    f->close_section();
     f->open_array_section("created_pools");
     for (auto pool : created_pools) {
       f->dump_unsigned("pool", pool);
@@ -81,4 +118,5 @@ struct creating_pgs_t {
     o.push_back(c);
   }
 };
+WRITE_CLASS_ENCODER(creating_pgs_t::create_info);
 WRITE_CLASS_ENCODER(creating_pgs_t);
index 9aa7ea6df5e74061346df8657fa11d7ff97240a3..4fc513a319795f7446636829ff3e67f6042c7402 100644 (file)
@@ -942,51 +942,86 @@ void OSDMonitor::create_pending()
 creating_pgs_t
 OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
 {
+  dout(10) << __func__ << dendl;
   creating_pgs_t pending_creatings;
   {
     std::lock_guard<std::mutex> l(creating_pgs_lock);
     pending_creatings = creating_pgs;
   }
-  if (pending_creatings.last_scan_epoch > inc.epoch) {
-    return pending_creatings;
-  }
-  if (osdmap.get_epoch() &&
-      !osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
-    const unsigned total = pending_creatings.pgs.size();
-    mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
-                                          &pending_creatings);
-    dout(7) << __func__ << total - pending_creatings.pgs.size()
-           << " pgs added from pgmap" << dendl;
-  }
-  unsigned added = 0;
-  added += scan_for_creating_pgs(osdmap.get_pools(),
-                                inc.old_pools,
-                                inc.modified,
-                                &pending_creatings);
-  added += scan_for_creating_pgs(inc.new_pools,
-                                inc.old_pools,
-                                inc.modified,
-                                &pending_creatings);
-  dout(10) << __func__ << added << " pg added for new pools" << dendl;
-  for (auto deleted_pool : inc.old_pools) {
-    auto removed = pending_creatings.remove_pool(deleted_pool);
-    dout(10) << __func__ << removed
-            << " pg removed because containing pool deleted: "
-            << deleted_pool << dendl;
-  }
-  // pgmon updates its creating_pgs in check_osd_map() which is called by
-  // on_active() and check_osd_map() could be delayed if lease expires, so its
-  // creating_pgs could be stale in comparison with the one of osdmon. let's
-  // trim them here. otherwise, they will be added back after being erased.
-  unsigned removed = 0;
-  for (auto& pg : pending_created_pgs) {
-    pending_creatings.created_pools.insert(pg.pool());
-    removed += pending_creatings.pgs.erase(pg);
-  }
-  pending_created_pgs.clear();
-  dout(10) << __func__ << removed << " pgs removed because they're created"
-          << dendl;
-  pending_creatings.last_scan_epoch = osdmap.get_epoch();
+  // check for new or old pools
+  if (pending_creatings.last_scan_epoch < inc.epoch) {
+    if (osdmap.get_epoch() &&
+       osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
+      const unsigned total = pending_creatings.pgs.size();
+      mon->pgservice->maybe_add_creating_pgs(creating_pgs.last_scan_epoch,
+                                            &pending_creatings);
+      dout(7) << __func__ << total - pending_creatings.pgs.size()
+             << " pgs added from pgmap" << dendl;
+    }
+    scan_for_creating_pgs(osdmap.get_pools(),
+                         inc.old_pools,
+                         inc.modified,
+                         &pending_creatings);
+    scan_for_creating_pgs(inc.new_pools,
+                         inc.old_pools,
+                         inc.modified,
+                         &pending_creatings);
+    for (auto deleted_pool : inc.old_pools) {
+      auto removed = pending_creatings.remove_pool(deleted_pool);
+      dout(10) << __func__ << removed
+               << " pg removed because containing pool deleted: "
+               << deleted_pool << dendl;
+      last_epoch_clean.remove_pool(deleted_pool);
+    }
+    // pgmon updates its creating_pgs in check_osd_map() which is called by
+    // on_active() and check_osd_map() could be delayed if lease expires, so its
+    // creating_pgs could be stale in comparison with the one of osdmon. let's
+    // trim them here. otherwise, they will be added back after being erased.
+    unsigned removed = 0;
+    for (auto& pg : pending_created_pgs) {
+      pending_creatings.created_pools.insert(pg.pool());
+      removed += pending_creatings.pgs.erase(pg);
+    }
+    pending_created_pgs.clear();
+    dout(10) << __func__ << removed << " pgs removed because they're created"
+             << dendl;
+    pending_creatings.last_scan_epoch = osdmap.get_epoch();
+  }
+
+  // process queue
+  unsigned max = MAX(1, g_conf->mon_osd_max_creating_pgs);
+  while (pending_creatings.pgs.size() < max &&
+        !pending_creatings.queue.empty()) {
+    auto p = pending_creatings.queue.begin();
+    int64_t poolid = p->first;
+    dout(10) << __func__ << " pool " << poolid
+            << " created " << p->second.created
+            << " modified " << p->second.modified
+            << " [" << p->second.start << "-" << p->second.end << ")"
+            << dendl;
+    int n = MIN(max - pending_creatings.pgs.size(),
+               p->second.end - p->second.start);
+    ps_t first = p->second.start;
+    ps_t end = first + n;
+    for (ps_t ps = first; ps < end; ++ps) {
+      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
+      pending_creatings.pgs.emplace(pgid, make_pair(p->second.created,
+                                                   p->second.modified));
+      dout(10) << __func__ << " adding " << pgid << dendl;
+    }
+    p->second.start = end;
+    if (p->second.done()) {
+      dout(10) << __func__ << " done with queue for " << poolid << dendl;
+      pending_creatings.queue.erase(p);
+    } else {
+      dout(10) << __func__ << " pool " << poolid
+              << " now [" << p->second.start << "-" << p->second.end << ")"
+              << dendl;
+    }
+  }
+  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
+          << " pools" << dendl;
+
   return pending_creatings;
 }
 
@@ -1337,7 +1372,6 @@ void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
       dout(20) << __func__ << " pgmap shows " << p->first << " is created"
               << dendl;
       p = creating_pgs->pgs.erase(p);
-      creating_pgs->created_pools.insert(q->first.pool());
     } else {
       ++p;
     }
@@ -3203,13 +3237,12 @@ void OSDMonitor::check_pg_creates_sub(Subscription *sub)
   }
 }
 
-unsigned OSDMonitor::scan_for_creating_pgs(
+void OSDMonitor::scan_for_creating_pgs(
   const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
   const mempool::osdmap::set<int64_t>& removed_pools,
   utime_t modified,
   creating_pgs_t* creating_pgs) const
 {
-  unsigned total = 0;
   for (auto& p : pools) {
     int64_t poolid = p.first;
     const pg_pool_t& pool = p.second;
@@ -3230,17 +3263,16 @@ unsigned OSDMonitor::scan_for_creating_pgs(
               << " " << pool << dendl;
       continue;
     }
-    unsigned added = creating_pgs->create_pool(poolid, pool.get_pg_num(),
-                                              created, modified);
-    dout(10) << __func__ << added << " pgs added for pool "<< poolid
+    dout(10) << __func__ << " queueing pool create for " << poolid
             << " " << pool << dendl;
-    total += added;
+    creating_pgs->create_pool(poolid, pool.get_pg_num(), created, modified);
   }
-  return total;
 }
 
 void OSDMonitor::update_creating_pgs()
 {
+  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
+          << creating_pgs.queue.size() << " pools in queue" << dendl;
   decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
   std::lock_guard<std::mutex> l(creating_pgs_lock);
   for (auto& pg : creating_pgs.pgs) {
@@ -3264,7 +3296,10 @@ void OSDMonitor::update_creating_pgs()
            mapped = mapping.get_epoch();
           }
           break;
-        }
+        } else {
+         // newly creating
+         mapped = mapping.get_epoch();
+       }
       }
     }
     dout(10) << __func__ << " will instruct osd." << acting_primary