git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Deny reservation if expected backfill size would put us over backfill_full_ratio...
author David Zafman <dzafman@redhat.com>
Wed, 17 Oct 2018 19:31:59 +0000 (12:31 -0700)
committer David Zafman <dzafman@redhat.com>
Tue, 18 Dec 2018 17:30:44 +0000 (09:30 -0800)
Erasure Coded Pools

Fixes: http://tracker.ceph.com/issues/19753
Signed-off-by: David Zafman <dzafman@redhat.com>
src/osd/ECBackend.cc
src/osd/ECBackend.h
src/osd/PG.cc
src/osd/PG.h
src/osd/PGBackend.h
src/osd/PrimaryLogPG.cc

index bc9c4c460863eb4a5c32e9172564cf06804bd894..1e668376c806977ccbb3a65958157bb868c97aae 100644 (file)
@@ -329,6 +329,15 @@ void ECBackend::handle_recovery_push(
     ceph_assert(op.data.length() == 0);
   }
 
+  if (get_parent()->pg_is_remote_backfilling()) {
+    get_parent()->pg_add_local_num_bytes(op.data.length());
+    get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count());
+    dout(10) << __func__ << " " << op.soid
+             << " add new actual data by " << op.data.length()
+             << " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count()
+             << dendl;
+  }
+
   if (op.before_progress.first) {
     ceph_assert(op.attrset.count(string("_")));
     m->t.setattrs(
@@ -365,6 +374,20 @@ void ECBackend::handle_recovery_push(
        ObjectContextRef(),
        false,
        &m->t);
+      if (get_parent()->pg_is_remote_backfilling()) {
+        struct stat st;
+        int r = store->stat(ch, ghobject_t(op.soid, ghobject_t::NO_GEN,
+                            get_parent()->whoami_shard().shard), &st);
+        if (r == 0) {
+          get_parent()->pg_sub_local_num_bytes(st.st_size);
+         // XXX: This can be way overestimated for small objects
+         get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count());
+         dout(10) << __func__ << " " << op.soid
+                  << " sub actual data by " << st.st_size
+                  << " sub num_bytes by " << st.st_size * get_ec_data_chunk_count()
+                  << dendl;
+        }
+      }
     }
   }
   m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp());
index d8d4a71d664bffba5ff97007eabf29cf2b45e26d..6784925df9ca532fee863483b420539e3754e0e2 100644 (file)
@@ -601,6 +601,13 @@ public:
     return new ECRecPred(ec_impl);
   }
 
+  int get_ec_data_chunk_count() const override {  // number of data chunks, as reported by ec_impl
+    return ec_impl->get_data_chunk_count();
+  }
+  int get_ec_stripe_chunk_size() const override {  // chunk size, as reported by sinfo
+    return sinfo.get_chunk_size();
+  }
+
   /**
    * ECReadPred
    *
index 170d6ae16f274977f8d44209b6371d0c0c15cbe2..102a7598a48d6bf08ce912f913fd9dbfec2f55b1 100644 (file)
@@ -7688,6 +7688,14 @@ PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
   int64_t primary_num_bytes = evt.primary_num_bytes;
   int64_t local_num_bytes = evt.local_num_bytes;
   if (primary_num_bytes) {
+    // For erasure coded pool overestimate by a full stripe per object
+    // because we don't know how each object was rounded to the nearest stripe
+    if (pg->pool.info.is_erasure()) {
+      primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
+      primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
+      local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
+      local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
+    }
     pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
     ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
                        << " local " << (local_num_bytes >> 10) << "KiB"
index c8df5c1e922fc11ffd0ab4401917c051550d296c..e631860d90aea22be0d8f5c0e47530419fa90b8d 100644 (file)
@@ -1280,10 +1280,18 @@ public:
   int64_t get_stats_num_bytes() {
     Mutex::Locker l(_lock);
     int num_bytes = info.stats.stats.sum.num_bytes;
+    if (pool.info.is_erasure()) {  // erasure-coded pool: adjust the stats total before it is compared with local_num_bytes
+      num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();  // divide the total by the data chunk count
+      // Round up each object by a stripe
+      num_bytes +=  get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;  // NOTE(review): num_bytes is declared int above although the function returns int64_t -- confirm this cannot truncate
+    }
     int64_t lnb = local_num_bytes.load();
     if (lnb && lnb != num_bytes) {
       lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
-                           << lnb << " vs stats " << num_bytes << dendl;
+                           << lnb << " vs stats "
+                            << info.stats.stats.sum.num_bytes << " / chunk "  // NOTE(review): logs the raw stats total, not the adjusted num_bytes that was compared
+                            << get_pgbackend()->get_ec_data_chunk_count()  // printed for every pool type, not only erasure-coded ones
+                            << dendl;
     }
     return num_bytes;
   }
index 58dd6193529a5df0745e2fc2857ce21b90833544..837e2cce10f3d3aa35d59ca3ef76a6fb23a9a2b9 100644 (file)
@@ -412,6 +412,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
    virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0;
    virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
    virtual int get_ec_data_chunk_count() const { return 0; };
+   virtual int get_ec_stripe_chunk_size() const { return 0; };  // default is 0; ECBackend overrides this to return sinfo.get_chunk_size()
 
    virtual void dump_recovery_info(Formatter *f) const = 0;
 
index 4b15e7e44d9083a634095b7e28473df3fb7a76fa..08a2616b3748212af8ef7f3d40e693eb0a4b3389 100644 (file)
@@ -4314,10 +4314,29 @@ void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
                                pg_whoami.shard) , &st);
       if (r == 0) {
         sub_local_num_bytes(st.st_size);
-        int chunks = 1;
-        sub_num_bytes(st.st_size * chunks);
+        int64_t usersize;
+        if (pool.info.is_erasure()) {
+          bufferlist bv;
+         int r = osd->store->getattr(
+             ch,
+              ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
+             OI_ATTR,
+             bv);
+         if (r >= 0) {
+           object_info_t oi(bv);
+            usersize = oi.size * pgbackend->get_ec_data_chunk_count();
+          } else {
+            dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
+                    << " can't get object info" << dendl;
+            usersize = 0;
+          }
+        } else {
+          usersize = st.st_size;
+        }
+        sub_num_bytes(usersize);
         dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
                  << " sub actual data by " << st.st_size
+                 << " sub num_bytes by " << usersize
                  << dendl;
       }
     }