]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
Implement the collection hint transaction, add a new transaction type as expected...
authorGuang Yang <yguang@yahoo-inc.com>
Mon, 7 Jul 2014 11:32:23 +0000 (11:32 +0000)
committerGuang Yang <yguang@yahoo-inc.com>
Tue, 19 Aug 2014 07:10:47 +0000 (07:10 +0000)
Signed-off-by: Guang Yang (yguang@yahoo-inc.com)
14 files changed:
src/os/CollectionIndex.h
src/os/FileStore.cc
src/os/FileStore.h
src/os/HashIndex.cc
src/os/HashIndex.h
src/os/KeyValueStore.cc
src/os/KeyValueStore.h
src/os/LFNIndex.cc
src/os/LFNIndex.h
src/os/MemStore.cc
src/os/MemStore.h
src/os/ObjectStore.h
src/os/Transaction.cc
src/osd/OSD.cc

index d24d257325db2fb0c1a102b3b46771d5753f4a31..734c022fbd0e8a4975406032865aa99c9d912d91 100644 (file)
@@ -182,6 +182,18 @@ protected:
 
   CollectionIndex():access_lock("CollectionIndex::access_lock"){}
 
+  /**
+   * Pre-hash the collection; the collection should map to a PG folder.
+   *
+   * The default implementation in this base class hits assert(0), so it
+   * must not be reached for index types that do not provide their own
+   * version.
+   *
+   * @param pg_num            - pg number of the pool this collection belongs to.
+   * @param expected_num_objs - expected number of objects in this collection.
+   * @return 0 on success, an error code otherwise.
+   */
+  virtual int pre_hash_collection(
+      uint32_t pg_num,            ///< [in] pg number of the pool this collection belongs to
+      uint64_t expected_num_objs  ///< [in] expected number of objects this collection has
+      ) { assert(0); return 0; }
+
   /// Virtual destructor
   virtual ~CollectionIndex() {}
 };
index 4481c93f56b65f409a6dbeadc864118ba5bc84eb..08b29f7300977f9ea9cd1ecb96679cf3e6827cba 100644 (file)
@@ -2378,6 +2378,28 @@ unsigned FileStore::_do_transaction(
       }
       break;
 
+    case Transaction::OP_COLL_HINT:
+      {
+        coll_t cid = i.decode_cid();
+        uint32_t type = i.decode_u32();
+        bufferlist hint;
+        i.decode_bl(hint);
+        bufferlist::iterator hiter = hint.begin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          ::decode(pg_num, hiter);
+          ::decode(num_objs, hiter);
+          if (_check_replay_guard(cid, spos) > 0) {
+            r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos);
+          }
+        } else {
+          // Ignore the hint
+          dout(10) << "Unrecognized collection hint type: " << type << dendl;
+        }
+      }
+      break;
+
     case Transaction::OP_RMCOLL:
       {
        coll_t cid = i.decode_cid();
@@ -4571,6 +4593,34 @@ ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c,
   return object_map->get_iterator(hoid);
 }
 
+int FileStore::_collection_hint_expected_num_objs(coll_t c, uint32_t pg_num,
+    uint64_t expected_num_objs,
+    const SequencerPosition &spos)
+{
+  dout(15) << __func__ << " collection: " << c << " pg number: "
+     << pg_num << " expected number of objects: " << expected_num_objs << dendl;
+
+  if (!collection_empty(c) && !replaying) {
+    dout(0) << "Failed to give an expected number of objects hint to collection : "
+      << c << ", only empty collection can take such type of hint. " << dendl;
+    return 0;
+  }
+
+  int ret;
+  Index index;
+  ret = get_index(c, &index);
+  if (ret < 0)
+    return ret;
+  // Pre-hash the collection
+  ret = index->pre_hash_collection(pg_num, expected_num_objs);
+  dout(10) << "pre_hash_collection " << c << " = " << ret << dendl;
+  if (ret < 0)
+    return ret;
+  _set_replay_guard(c, spos);
+
+  return 0;
+}
+
 int FileStore::_create_collection(
   coll_t c,
   const SequencerPosition &spos)
index fcb4796d3b973195f673b83648c3704b99d1eeeb..3c77b05be9376591d2217a939125b9767ccdfaa7 100644 (file)
@@ -623,6 +623,19 @@ public:
   int _create_collection(coll_t c);
   int _create_collection(coll_t c, const SequencerPosition &spos);
   int _destroy_collection(coll_t c);
+  /**
+   * Give an expected number of objects hint to the collection.
+   *
+   * @param c                 - collection id.
+   * @param pg_num            - pg number of the pool this collection belongs to.
+   * @param expected_num_objs - expected number of objects in this collection.
+   * @param spos              - sequencer position of the operation.
+   *
+   * @return 0 on success, an error code otherwise.
+   */
+  int _collection_hint_expected_num_objs(coll_t c, uint32_t pg_num,
+      uint64_t expected_num_objs,
+      const SequencerPosition &spos);
   int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid,
                      const SequencerPosition& spos);
   int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
index 346ee0d6154486f5393565110dde39aedc435b87..706d75ada17643459f685ff630c5dbc89c01831e 100644 (file)
@@ -344,6 +344,152 @@ int HashIndex::prep_delete() {
   return recursive_remove(vector<string>());
 }
 
+/**
+ * Pre-create the hashed folder layout of this collection.
+ *
+ * Three steps, each aborting on a negative result: read the dir info of the
+ * collection root, create the split folders, then initialize the dir info
+ * of every folder starting from the root at hash level 0.
+ *
+ * NOTE(review): the root info is only fetched here, its contents are not
+ * inspected, so this function itself does not verify that the collection
+ * holds neither objects nor sub-folders.
+ *
+ * @return 0 on success, otherwise the negative value of the failing step.
+ */
+int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
+  vector<string> root;
+  subdir_info_s root_info;
+
+  int r = get_info(root, &root_info);
+  if (r >= 0) {
+    // Do the folder splitting first
+    r = pre_split_folder(pg_num, expected_num_objs);
+  }
+  if (r < 0)
+    return r;
+
+  // Initialize the folder info starting from root
+  return init_split_folder(root, 0);
+}
+
+int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
+{
+  // If folder merging is enabled (by setting the threshold positive),
+  // no need to split
+  if (merge_threshold > 0)
+    return 0;
+  const coll_t c = coll();
+  // Do not split if the expected number of objects in this collection is zero (by default)
+  if (expected_num_objs == 0)
+    return 0;
+
+  // Calculate the number of leaf folders (which actually store files)
+  // need to be created
+  const uint64_t objs_per_folder = (unsigned)(abs(merge_threshold)) * 16 * split_multiplier;
+  uint64_t leavies = expected_num_objs / objs_per_folder ;
+  // No need to split
+  if (leavies == 0 || expected_num_objs == objs_per_folder)
+    return 0;
+
+  spg_t spgid;
+  if (!c.is_pg_prefix(spgid))
+    return -EINVAL;
+  const ps_t ps = spgid.pgid.ps();
+
+  // the most significant bits of pg_num
+  const int pg_num_bits = calc_num_bits(pg_num - 1);
+  ps_t tmp_id = ps;
+  // calculate the number of levels we only create one sub folder
+  int num = pg_num_bits / 4;
+  // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
+  // so that splitting starts at level 3
+  if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
+    --num;
+  }
+
+  int ret;
+  // Start with creation that only has one subfolder
+  vector<string> paths;
+  int dump_num = num;
+  while (num-- > 0) {
+    ps_t v = tmp_id & 0x0000000f;
+    paths.push_back(to_hex(v));
+    ret = create_path(paths);
+    if (ret < 0 && ret != -EEXIST)
+      return ret;
+    tmp_id = tmp_id >> 4;
+  }
+
+  // Starting from here, we can split by creating multiple subfolders
+  const int left_bits = pg_num_bits - dump_num * 4;
+  // this variable denotes how many bits (for this level) that can be
+  // used for sub folder splitting
+  int split_bits = 4 - left_bits;
+  // the below logic is inspired by rados.h#ceph_stable_mod,
+  // it basically determines how many sub-folders should we
+  // create for splitting
+  if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
+    ++split_bits;
+  }
+  const uint32_t subs = (1 << split_bits);
+  // Calculate how many levels we create starting from here
+  int level  = 0;
+  leavies /= subs;
+  while (leavies > 1) {
+    ++level;
+    leavies = leavies >> 4;
+  }
+  for (uint32_t i = 0; i < subs; ++i) {
+    int v = tmp_id | (i << ((4 - split_bits) % 4));
+    paths.push_back(to_hex(v));
+    ret = create_path(paths);
+    if (ret < 0 && ret != -EEXIST)
+      return ret;
+    ret = recursive_create_path(paths, level);
+    if (ret < 0)
+      return ret;
+    paths.pop_back();
+  }
+  return 0;
+}
+
+int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
+{
+  // Get the number of sub directories for the current path
+  set<string> subdirs;
+  int ret = list_subdirs(path, &subdirs);
+  if (ret < 0)
+    return ret;
+  subdir_info_s info;
+  info.subdirs = subdirs.size();
+  info.hash_level = hash_level;
+  ret = set_info(path, info);
+  if (ret < 0)
+    return ret;
+  ret = fsync_dir(path);
+  if (ret < 0)
+    return ret;
+
+  // Do the same for subdirs
+  set<string>::const_iterator iter;
+  for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
+    path.push_back(*iter);
+    ret = init_split_folder(path, hash_level + 1);
+    if (ret < 0)
+      return ret;
+    path.pop_back();
+  }
+  return 0;
+}
+
+int HashIndex::recursive_create_path(vector<string>& path, int level)
+{
+  if (level == 0)
+    return 0;
+  int ret;
+  for (int i = 0; i < 16; ++i) {
+    path.push_back(to_hex(i));
+    ret = create_path(path);
+    if (ret < 0 && ret != -EEXIST)
+      return ret;
+    ret = recursive_create_path(path, level - 1);
+    if (ret < 0)
+      return ret;
+    path.pop_back();
+  }
+  return 0;
+}
+
 int HashIndex::recursive_remove(const vector<string> &path) {
   set<string> subdirs;
   int r = list_subdirs(path, &subdirs);
index 68bc147248f4e21990bcb1c7bb41ac5c4bf0a15d..dad8ce31b8700e8a0156517e576ae5df88151dc7 100644 (file)
@@ -158,7 +158,7 @@ public:
     uint32_t bits,
     CollectionIndex* dest
     );
-       
+
 protected:
   int _init();
 
@@ -181,6 +181,16 @@ protected:
   int _collection_list(
     vector<ghobject_t> *ls
     );
+
+  /**
+   * Pre-hash the collection to create folders according to the expected number
+   * of objects in this collection.
+   */
+  int _pre_hash_collection(
+      uint32_t pg_num,
+      uint64_t expected_num_objs
+      );
+
   int _collection_list_partial(
     const ghobject_t &start,
     int min_count,
@@ -267,6 +277,14 @@ private:
     vector<string> *path   ///< [out] Path components for hoid.
     );
 
+  /// Pre-hash and split folders to avoid runtime splitting
+  /// according to the given expected object number.
+  int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs);
+
+  /// Initialize the folder (dir info) with the given hash
+  /// level and number of its subdirs.
+  int init_split_folder(vector<string> &path, uint32_t hash_level);
+
   /// do collection split for path
   static int col_split_level(
     HashIndex &from,            ///< [in] from index
@@ -316,6 +334,25 @@ private:
       *bits = path.size() * 4;
   }
 
+  /// Number of bits needed to represent n, i.e. the position of its
+  /// highest set bit counted from 1 (0 for n == 0).
+  static int calc_num_bits(uint64_t n) {
+    int bits = 0;
+    for (; n != 0; n >>= 1)
+      ++bits;
+    return bits;
+  }
+
+  /// Render a single hex digit as a one-character, upper case string.
+  /// n must be within [0, 16); this is asserted.
+  static string to_hex(int n) {
+    assert(n >= 0 && n < 16);
+    char digit;
+    if (n <= 9)
+      digit = '0' + n;
+    else
+      digit = 'A' + n - 10;
+    return string(1, digit);
+  }
+
   /// Get path contents by hash
   int get_path_contents_by_hash(
     const vector<string> &path,            /// [in] Path to list
@@ -335,6 +372,10 @@ private:
     ghobject_t *next,            /// [in,out] List objects >= *next
     vector<ghobject_t> *out      /// [out] Listed objects
     ); ///< @return Error Code, 0 on success
+
+  /// Create the given levels of sub directories from the given root.
+  /// The contents of *path* are restored before a successful return (not on failure).
+  int recursive_create_path(vector<string>& path, int level);
 };
 
 #endif
index 4f20ef6d778c5bccccddb01bb21638931953924d..34c56f54295ba9cc979565a8d0b362394cb558ec 100644 (file)
@@ -1325,6 +1325,26 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
       }
       break;
 
+    case Transaction::OP_COLL_HINT:
+      {
+        coll_t cid = i.decode_cid();
+        uint32_t type = i.decode_u32();
+        bufferlist hint;
+        i.decode_bl(hint);
+        bufferlist::iterator hiter = hint.begin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          ::decode(pg_num, hiter);
+          ::decode(num_objs, hiter);
+          r = _collection_hint_expected_num_objs(cid, pg_num, num_objs);
+        } else {
+          // Ignore the hint
+          dout(10) << "Unrecognized collection hint type: " << type << dendl;
+        }
+      }
+      break;
+
     case Transaction::OP_RMCOLL:
       {
         coll_t cid = i.decode_cid();
index 1ce5c1927cad7d185fb230d55953a96373829eed..da3aadf92f207661cd3f29ba244c59bf71848445 100644 (file)
@@ -597,6 +597,8 @@ class KeyValueStore : public ObjectStore,
                            BufferTransaction &t);
 
   // collections
+  int _collection_hint_expected_num_objs(coll_t cid, uint32_t pg_num,
+      uint64_t num_objs) const { return 0; }  // no-op: the hint is accepted and ignored
   int _create_collection(coll_t c, BufferTransaction &t);
   int _destroy_collection(coll_t c, BufferTransaction &t);
   int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid,
index 755faa4665beb340db57c40f334375ab12e87c79..c480e6e7cbd3ec5157fd2470a3161b26456e884c 100644 (file)
@@ -148,6 +148,11 @@ int LFNIndex::collection_list(vector<ghobject_t> *ls)
   return _collection_list(ls);
 }
 
+int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
+{
+  return _pre_hash_collection(pg_num, expected_num_objs);  // forwards both arguments unchanged to the _pre_hash_collection() hook
+}
+
 
 int LFNIndex::collection_list_partial(const ghobject_t &start,
                                      int min_count,
index dad6c39d688f6c70a50da40454cec3cf3055aa1b..e594f9eba684975c4f4069a89db83f8b09b6018a 100644 (file)
@@ -182,6 +182,12 @@ public:
     vector<ghobject_t> *ls
     );
 
+  /// @see CollectionIndex
+  int pre_hash_collection(
+      uint32_t pg_num,
+      uint64_t expected_num_objs
+      );
+
   /// @see CollectionIndex
   int collection_list_partial(
     const ghobject_t &start,
@@ -251,6 +257,13 @@ protected:
     vector<ghobject_t> *ls ///< [out] Listed objects.
     ) = 0;
 
+  /// Pre-hash the collection with the given pg number and
+  /// expected number of objects in the collection.
+  virtual int _pre_hash_collection(
+      uint32_t pg_num,
+      uint64_t expected_num_objs
+      ) = 0;
+
   /// @see CollectionIndex
   virtual int _collection_list_partial(
     const ghobject_t &start,
index 3fdab0f0c27f01c3abf206792f0cb89247a8ff8c..6743916b2000b8b98d376b3988152145accd9eef 100644 (file)
@@ -818,6 +818,26 @@ void MemStore::_do_transaction(Transaction& t)
       }
       break;
 
+    case Transaction::OP_COLL_HINT:
+      {
+        coll_t cid = i.decode_cid();
+        uint32_t type = i.decode_u32();
+        bufferlist hint;
+        i.decode_bl(hint);
+        bufferlist::iterator hiter = hint.begin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          ::decode(pg_num, hiter);
+          ::decode(num_objs, hiter);
+          r = _collection_hint_expected_num_objs(cid, pg_num, num_objs);
+        } else {
+          // Ignore the hint
+          dout(10) << "Unrecognized collection hint type: " << type << dendl;
+        }
+      }
+      break;
+
     case Transaction::OP_RMCOLL:
       {
        coll_t cid = i.decode_cid();
index 0fa9d58fc1e6527d7f8ac42411666d5f05b93e18..dbdc7f7be0e2947c5bc5e6288f6cb2b646094b9f 100644 (file)
@@ -207,6 +207,8 @@ private:
                       const string& first, const string& last);
   int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl);
 
+  int _collection_hint_expected_num_objs(coll_t cid, uint32_t pg_num,
+      uint64_t num_objs) const { return 0; }  // no-op: the hint is accepted and ignored
   int _create_collection(coll_t c);
   int _destroy_collection(coll_t c);
   int _collection_add(coll_t cid, coll_t ocid, const ghobject_t& oid);
index 1c11c0960cac93e3c6aa36e22b4bc67e5e0cb675..53625b11896e2cec34652030c6b589e4676e9758 100644 (file)
@@ -371,6 +371,11 @@ public:
       OP_COLL_HINT = 40, // cid, type, bl
     };
 
+    // Collection hint types (the "type" field of an OP_COLL_HINT op)
+    enum {
+      COLL_HINT_EXPECTED_NUM_OBJECTS = 1,  // hint payload: pg_num (u32) followed by the expected object count (u64)
+    };
+
   private:
     uint64_t ops;
     uint64_t pad_unused_bytes;
index 4bf12fa7af543aa684426ad2fa87df22359b1ac6..81687985efd50c2fef4ac23339dada52a5fe57c9 100644 (file)
@@ -212,6 +212,17 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
         f->dump_string("op_name", "coll_hint");
         f->dump_stream("collection") << cid;
         f->dump_unsigned("type", type);
+        bufferlist hint;
+        i.decode_bl(hint);
+        bufferlist::iterator hiter = hint.begin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          ::decode(pg_num, hiter);
+          ::decode(num_objs, hiter);
+          f->dump_unsigned("pg_num", pg_num);
+          f->dump_unsigned("expected_num_objects", num_objs);
+        }
       }
       break;
 
index b4149b604717c88166bfa4e804cbbd7a0be141ab..a1ac5e912086dfe66697034d8e77b05dc68dd5d7 100644 (file)
@@ -3194,8 +3194,21 @@ void OSD::handle_pg_peering_evt(
     PG::RecoveryCtx rctx = create_context();
     switch (result) {
     case RES_NONE: {
+      const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
+      coll_t cid(pgid);
+
       // ok, create the pg locally using provided Info and History
-      rctx.transaction->create_collection(coll_t(pgid));
+      rctx.transaction->create_collection(cid);
+
+      // Give a hint to the PG collection
+      bufferlist hint;
+      uint32_t pg_num = pp->get_pg_num();
+      uint64_t expected_num_objects_pg = pp->expected_num_objects / pg_num;
+      ::encode(pg_num, hint);
+      ::encode(expected_num_objects_pg, hint);
+      uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
+      rctx.transaction->collection_hint(cid, hint_type, hint);
+
       PG *pg = _create_lock_pg(
        get_map(epoch),
        pgid, create, false, result == RES_SELF,
@@ -7048,8 +7061,20 @@ void OSD::handle_pg_create(OpRequestRef op)
 
     PG *pg = NULL;
     if (can_create_pg(pgid)) {
+      const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
       pg_interval_map_t pi;
-      rctx.transaction->create_collection(coll_t(pgid));
+      coll_t cid(pgid);
+      rctx.transaction->create_collection(cid);
+
+      // Give a hint to the PG collection
+      bufferlist hint;
+      uint32_t pg_num = pp->get_pg_num();
+      uint64_t expected_num_objects_pg = pp->expected_num_objects / pg_num;
+      ::encode(pg_num, hint);
+      ::encode(expected_num_objects_pg, hint);
+      uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
+      rctx.transaction->collection_hint(cid, hint_type, hint);
+
       pg = _create_lock_pg(
        osdmap, pgid, true, false, false,
        0, creating_pgs[pgid].acting, whoami,