/// Default constructor: initializes access_lock with the label "CollectionIndex::access_lock".
CollectionIndex():access_lock("CollectionIndex::access_lock"){}
+ /**
+ * Pre-hash the collection; this collection should map to a PG folder.
+ *
+ * This default implementation asserts unconditionally and then returns 0.
+ *
+ * @param pg_num - pg number of the pool this collection belongs to.
+ * @param expected_num_objs - expected number of objects in this collection.
+ * @return 0 on success, an error code otherwise.
+ */
+ virtual int pre_hash_collection(
+ uint32_t pg_num, ///< [in] pg number of the pool this collection belongs to
+ uint64_t expected_num_objs ///< [in] expected number of objects this collection has
+ ) { assert(0); return 0; }
+
/// Virtual destructor (empty body), so that derived indexes can be destroyed
/// through a CollectionIndex pointer.
virtual ~CollectionIndex() {}
};
}
break;
+ // Collection hint op: carries a collection id, a hint type and a
+ // type-specific payload. The payload is always decoded from the op
+ // stream, before the type is inspected.
+ case Transaction::OP_COLL_HINT:
+ {
+ coll_t cid = i.decode_cid();
+ uint32_t type = i.decode_u32();
+ bufferlist hint;
+ i.decode_bl(hint);
+ bufferlist::iterator hiter = hint.begin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ // Payload layout: pg_num (uint32_t) followed by the expected
+ // number of objects (uint64_t).
+ uint32_t pg_num;
+ uint64_t num_objs;
+ ::decode(pg_num, hiter);
+ ::decode(num_objs, hiter);
+ // The hint is only applied when the replay guard check for this
+ // collection/position returns a positive value; otherwise r is
+ // left untouched.
+ if (_check_replay_guard(cid, spos) > 0) {
+ r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos);
+ }
+ } else {
+ // Ignore the hint
+ dout(10) << "Unrecognized collection hint type: " << type << dendl;
+ }
+ }
+ break;
+
case Transaction::OP_RMCOLL:
{
coll_t cid = i.decode_cid();
return object_map->get_iterator(hoid);
}
+/**
+ * Apply an "expected number of objects" hint to collection c by asking its
+ * index to pre-hash itself.
+ *
+ * A non-empty collection (outside of replay) is logged and skipped; that
+ * case still reports success. On a successful pre-hash the replay guard
+ * is set for (c, spos).
+ *
+ * @return 0 on success or skip, otherwise the negative error returned by
+ *         get_index() or pre_hash_collection().
+ */
+int FileStore::_collection_hint_expected_num_objs(coll_t c, uint32_t pg_num,
+    uint64_t expected_num_objs,
+    const SequencerPosition &spos)
+{
+  dout(15) << __func__ << " collection: " << c << " pg number: "
+           << pg_num << " expected number of objects: " << expected_num_objs << dendl;
+
+  // Proceed only for an empty collection, or unconditionally while replaying.
+  if (!(collection_empty(c) || replaying)) {
+    dout(0) << "Failed to give an expected number of objects hint to collection : "
+            << c << ", only empty collection can take such type of hint. " << dendl;
+    return 0;
+  }
+
+  Index idx;
+  const int lookup_rc = get_index(c, &idx);
+  if (lookup_rc < 0) {
+    return lookup_rc;
+  }
+
+  // Pre-hash the collection
+  const int hash_rc = idx->pre_hash_collection(pg_num, expected_num_objs);
+  dout(10) << "pre_hash_collection " << c << " = " << hash_rc << dendl;
+  if (hash_rc < 0) {
+    return hash_rc;
+  }
+
+  _set_replay_guard(c, spos);
+  return 0;
+}
+
int FileStore::_create_collection(
coll_t c,
const SequencerPosition &spos)
int _create_collection(coll_t c);
int _create_collection(coll_t c, const SequencerPosition &spos);
int _destroy_collection(coll_t c);
+ /**
+ * Give an expected number of objects hint to the collection.
+ *
+ * A non-empty collection (when not replaying) does not take the hint;
+ * this is logged and still reported as success.
+ *
+ * @param c - collection id.
+ * @param pg_num - pg number of the pool this collection belongs to
+ * @param expected_num_objs - expected number of objects in this collection
+ * @param spos - sequence position
+ *
+ * @return 0 on success, an error code otherwise
+ */
+ int _collection_hint_expected_num_objs(coll_t c, uint32_t pg_num,
+ uint64_t expected_num_objs,
+ const SequencerPosition &spos);
int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid,
const SequencerPosition& spos);
int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
return recursive_remove(vector<string>());
}
+/**
+ * Pre-hash the collection: create the folder layout for the expected
+ * number of objects, then write the dir info for every folder.
+ *
+ * @return 0 on success, otherwise the negative error of the failing step.
+ */
+int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
+  vector<string> root;   // empty path, i.e. start at the collection root
+  subdir_info_s root_info;
+
+  // Load the root dir info; a failure here aborts the pre-hash.
+  // NOTE(review): root_info is not inspected afterwards, so this does not
+  // by itself verify that the collection holds no objects or sub-folders.
+  int rc = get_info(root, &root_info);
+  if (rc < 0)
+    return rc;
+
+  // Do the folder splitting first ...
+  rc = pre_split_folder(pg_num, expected_num_objs);
+  if (rc < 0)
+    return rc;
+
+  // ... then initialize the folder info starting from the root.
+  return init_split_folder(root, 0);
+}
+
+/**
+ * Pre-create the sub-folder layout this PG collection is expected to need,
+ * so folders do not have to be split while objects are being written.
+ *
+ * Nothing is created when merging is enabled (merge_threshold > 0), when
+ * no expectation was given (expected_num_objs == 0), when the per-folder
+ * capacity evaluates to 0, or when the expected objects fit into a single
+ * leaf folder.
+ *
+ * @param pg_num pg number of the pool this collection belongs to
+ * @param expected_num_objs expected number of objects in this collection
+ * @return 0 on success (including "nothing to do"); -EINVAL if the
+ *         collection is not a PG collection or pg_num is 0; otherwise the
+ *         negative error returned while creating a directory.
+ */
+int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
+{
+  // If folder merging is enabled (by setting the threshold positive),
+  // no need to split
+  if (merge_threshold > 0)
+    return 0;
+  const coll_t c = coll();
+  // Do not split if the expected number of objects in this collection is zero (by default)
+  if (expected_num_objs == 0)
+    return 0;
+
+  // Calculate the number of leaf folders (which actually store files)
+  // need to be created.  The capacity is computed in 64 bits so that large
+  // settings cannot wrap around in 32-bit arithmetic.
+  // NOTE(review): assumes merge_threshold and split_multiplier are integer
+  // members -- confirm against the class definition.
+  const uint64_t objs_per_folder =
+    static_cast<uint64_t>(abs(merge_threshold)) *
+    static_cast<uint64_t>(split_multiplier) * 16;
+  // A zero threshold or multiplier leaves no capacity to size the split by
+  // (and would divide by zero below); treat it as "nothing to pre-split".
+  if (objs_per_folder == 0)
+    return 0;
+  uint64_t leaves = expected_num_objs / objs_per_folder;
+  // No need to split
+  if (leaves == 0 || expected_num_objs == objs_per_folder)
+    return 0;
+
+  spg_t spgid;
+  if (!c.is_pg_prefix(spgid))
+    return -EINVAL;
+  const ps_t ps = spgid.pgid.ps();
+
+  // pg_num - 1 would wrap around for pg_num == 0 and push the shifts below
+  // out of range.
+  if (pg_num == 0)
+    return -EINVAL;
+
+  // the most significant bits of pg_num
+  const int pg_num_bits = calc_num_bits(pg_num - 1);
+  ps_t tmp_id = ps;
+  // calculate the number of levels we only create one sub folder
+  int num = pg_num_bits / 4;
+  // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
+  // so that splitting starts at level 3
+  // (compared in 64 bits: pg_num_bits may be 32, which a 32-bit shift
+  // cannot express)
+  if (pg_num_bits % 4 == 0 && pg_num < ((uint64_t)1 << pg_num_bits)) {
+    --num;
+  }
+
+  int ret;
+  // Start with creation that only has one subfolder
+  vector<string> paths;
+  int dump_num = num;
+  while (num-- > 0) {
+    ps_t v = tmp_id & 0x0000000f;
+    paths.push_back(to_hex(v));
+    ret = create_path(paths);
+    if (ret < 0 && ret != -EEXIST)
+      return ret;
+    tmp_id = tmp_id >> 4;
+  }
+
+  // Starting from here, we can split by creating multiple subfolders
+  const int left_bits = pg_num_bits - dump_num * 4;
+  // this variable denotes how many bits (for this level) that can be
+  // used for sub folder splitting
+  int split_bits = 4 - left_bits;
+  // the below logic is inspired by rados.h#ceph_stable_mod,
+  // it basically determines how many sub-folders should we
+  // create for splitting.
+  // With pg_num == 1 there is no high bit to test (pg_num_bits == 0) and
+  // all 4 bits of this level are already available, so skip the test
+  // rather than shifting by a negative amount.
+  if (pg_num_bits > 0 &&
+      ((((uint32_t)1 << (pg_num_bits - 1)) | ps) >= pg_num)) {
+    ++split_bits;
+  }
+  const uint32_t subs = (1 << split_bits);
+  // Calculate how many levels we create starting from here
+  int level = 0;
+  leaves /= subs;
+  while (leaves > 1) {
+    ++level;
+    leaves = leaves >> 4;
+  }
+  for (uint32_t i = 0; i < subs; ++i) {
+    // Place the sub-folder index in the bits of this level that are not
+    // already taken by the PG id.
+    int v = tmp_id | (i << ((4 - split_bits) % 4));
+    paths.push_back(to_hex(v));
+    ret = create_path(paths);
+    if (ret < 0 && ret != -EEXIST)
+      return ret;
+    ret = recursive_create_path(paths, level);
+    if (ret < 0)
+      return ret;
+    paths.pop_back();
+  }
+  return 0;
+}
+
+/**
+ * Write the dir info (sub-directory count and hash level) for the folder at
+ * path, sync it, and do the same for every sub-folder below it.
+ *
+ * @param path folder to initialize; components are pushed/popped while
+ *             descending
+ * @param hash_level hash level recorded for the folder at path
+ * @return 0 on success, otherwise the first negative error encountered
+ */
+int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
+{
+  // Discover the children of the current folder
+  set<string> children;
+  int rc = list_subdirs(path, &children);
+  if (rc < 0)
+    return rc;
+
+  // Record how many sub-directories it has and at which level it sits
+  subdir_info_s dir_info;
+  dir_info.subdirs = children.size();
+  dir_info.hash_level = hash_level;
+  rc = set_info(path, dir_info);
+  if (rc < 0)
+    return rc;
+  rc = fsync_dir(path);
+  if (rc < 0)
+    return rc;
+
+  // Recurse into each child, one level deeper
+  const uint32_t child_level = hash_level + 1;
+  set<string>::const_iterator it = children.begin();
+  while (it != children.end()) {
+    path.push_back(*it);
+    rc = init_split_folder(path, child_level);
+    if (rc < 0)
+      return rc;
+    path.pop_back();
+    ++it;
+  }
+  return 0;
+}
+
+/**
+ * Create `level` levels of sub-directories below path, with 16 children
+ * ("0".."F") per directory. Directories that already exist are accepted.
+ *
+ * path is used as a scratch stack while descending and holds the same
+ * components on return as on entry, on success and on error alike.
+ *
+ * @param path directory to create the sub-tree under
+ * @param level number of levels to create; values <= 0 create nothing
+ * @return 0 on success, otherwise the first negative error encountered
+ */
+int HashIndex::recursive_create_path(vector<string>& path, int level)
+{
+  // Nothing (more) to create. Using <= also keeps a negative level from
+  // recursing without bound.
+  if (level <= 0)
+    return 0;
+  for (int i = 0; i < 16; ++i) {
+    path.push_back(to_hex(i));
+    int ret = create_path(path);
+    // An already existing directory is fine
+    if (ret == -EEXIST)
+      ret = 0;
+    if (ret >= 0)
+      ret = recursive_create_path(path, level - 1);
+    // Restore path before any return so the caller's vector is unchanged
+    path.pop_back();
+    if (ret < 0)
+      return ret;
+  }
+  return 0;
+}
+
int HashIndex::recursive_remove(const vector<string> &path) {
set<string> subdirs;
int r = list_subdirs(path, &subdirs);
uint32_t bits,
CollectionIndex* dest
);
-
+
protected:
int _init();
int _collection_list(
vector<ghobject_t> *ls
);
+
+ /**
+ * Pre-hash the collection to create folders according to the expected number
+ * of objects in this collection.
+ *
+ * @param pg_num pg number of the pool this collection belongs to
+ * @param expected_num_objs expected number of objects in this collection
+ * @return 0 on success, a negative error code otherwise
+ */
+ int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ );
+
int _collection_list_partial(
const ghobject_t &start,
int min_count,
vector<string> *path ///< [out] Path components for hoid.
);
+ /// Pre-hash and split folders to avoid runtime splitting
+ /// according to the given expected object number.
+ /// Does nothing when merge_threshold is positive or expected_num_objs is 0.
+ /// @return 0 on success, a negative error code otherwise
+ int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs);
+
+ /// Initialize the folder (dir info) with the given hash
+ /// level and number of its subdirs, then recurse into each
+ /// subdir with hash_level + 1.
+ /// @return 0 on success, a negative error code otherwise
+ int init_split_folder(vector<string> &path, uint32_t hash_level);
+
/// do collection split for path
static int col_split_level(
HashIndex &from, ///< [in] from index
*bits = path.size() * 4;
}
+ /// Number of bits needed to represent n, i.e. the 1-based position of
+ /// its highest set bit (0 when n == 0).
+ static int calc_num_bits(uint64_t n) {
+   int bits = 0;
+   for (; n != 0; n >>= 1)
+     ++bits;
+   return bits;
+ }
+
+ /// Render a value in [0, 15] as a one-character, upper-case hex string.
+ static string to_hex(int n) {
+   assert(n >= 0 && n < 16);
+   // Digits map onto '0'..'9', the remaining values onto 'A'..'F'
+   const char digit = (n < 10) ? static_cast<char>('0' + n)
+                               : static_cast<char>('A' + n - 10);
+   return string(1, digit);
+ }
+
/// Get path contents by hash
int get_path_contents_by_hash(
const vector<string> &path, /// [in] Path to list
ghobject_t *next, /// [in,out] List objects >= *next
vector<ghobject_t> *out /// [out] Listed objects
); ///< @return Error Code, 0 on success
+
+ /// Create the given number of levels of sub directories below the given
+ /// root, 16 ("0".."F") per directory.
+ /// On success *path* holds the same components as before the call.
+ /// @return 0 on success, a negative error code otherwise
+ int recursive_create_path(vector<string>& path, int level);
};
#endif
}
break;
+ // Collection hint op: carries a collection id, a hint type and a
+ // type-specific payload. The payload is always decoded from the op
+ // stream, before the type is inspected.
+ case Transaction::OP_COLL_HINT:
+ {
+ coll_t cid = i.decode_cid();
+ uint32_t type = i.decode_u32();
+ bufferlist hint;
+ i.decode_bl(hint);
+ bufferlist::iterator hiter = hint.begin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ // Payload layout: pg_num (uint32_t) followed by the expected
+ // number of objects (uint64_t).
+ uint32_t pg_num;
+ uint64_t num_objs;
+ ::decode(pg_num, hiter);
+ ::decode(num_objs, hiter);
+ r = _collection_hint_expected_num_objs(cid, pg_num, num_objs);
+ } else {
+ // Ignore the hint
+ dout(10) << "Unrecognized collection hint type: " << type << dendl;
+ }
+ }
+ break;
+
case Transaction::OP_RMCOLL:
{
coll_t cid = i.decode_cid();
BufferTransaction &t);
// collections
+ /// Expected-number-of-objects hint handler. This implementation takes
+ /// no action on the hint and always reports success.
+ int _collection_hint_expected_num_objs(coll_t cid, uint32_t pg_num,
+     uint64_t num_objs) const
+ {
+   return 0;
+ }
int _create_collection(coll_t c, BufferTransaction &t);
int _destroy_collection(coll_t c, BufferTransaction &t);
int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid,
return _collection_list(ls);
}
+/// Public entry point for pre-hashing: forwards both arguments to
+/// _pre_hash_collection() and returns its result unchanged.
+int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
+{
+ return _pre_hash_collection(pg_num, expected_num_objs);
+}
+
int LFNIndex::collection_list_partial(const ghobject_t &start,
int min_count,
vector<ghobject_t> *ls
);
+ /// @see CollectionIndex
+ /// Forwards to _pre_hash_collection().
+ int pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ );
+
/// @see CollectionIndex
int collection_list_partial(
const ghobject_t &start,
vector<ghobject_t> *ls ///< [out] Listed objects.
) = 0;
+ /// Pre-hash the collection with the given pg number and
+ /// expected number of objects in the collection.
+ /// Pure virtual; the result is returned as-is by pre_hash_collection().
+ virtual int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) = 0;
+
/// @see CollectionIndex
virtual int _collection_list_partial(
const ghobject_t &start,
}
break;
+ // Collection hint op: carries a collection id, a hint type and a
+ // type-specific payload. The payload is always decoded from the op
+ // stream, before the type is inspected.
+ case Transaction::OP_COLL_HINT:
+ {
+ coll_t cid = i.decode_cid();
+ uint32_t type = i.decode_u32();
+ bufferlist hint;
+ i.decode_bl(hint);
+ bufferlist::iterator hiter = hint.begin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ // Payload layout: pg_num (uint32_t) followed by the expected
+ // number of objects (uint64_t).
+ uint32_t pg_num;
+ uint64_t num_objs;
+ ::decode(pg_num, hiter);
+ ::decode(num_objs, hiter);
+ r = _collection_hint_expected_num_objs(cid, pg_num, num_objs);
+ } else {
+ // Ignore the hint
+ dout(10) << "Unrecognized collection hint type: " << type << dendl;
+ }
+ }
+ break;
+
case Transaction::OP_RMCOLL:
{
coll_t cid = i.decode_cid();
const string& first, const string& last);
int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl);
+ /// Expected-number-of-objects hint handler. This implementation takes
+ /// no action on the hint and always reports success.
+ int _collection_hint_expected_num_objs(coll_t cid, uint32_t pg_num,
+     uint64_t num_objs) const
+ {
+   return 0;
+ }
int _create_collection(coll_t c);
int _destroy_collection(coll_t c);
int _collection_add(coll_t cid, coll_t ocid, const ghobject_t& oid);
OP_COLL_HINT = 40, // cid, type, bl
};
+ // Transaction hint type: the `type` field carried by an OP_COLL_HINT op.
+ enum {
+ // Payload: pg_num (uint32_t) followed by the expected number of
+ // objects (uint64_t).
+ COLL_HINT_EXPECTED_NUM_OBJECTS = 1,
+ };
+
private:
uint64_t ops;
uint64_t pad_unused_bytes;
f->dump_string("op_name", "coll_hint");
f->dump_stream("collection") << cid;
f->dump_unsigned("type", type);
+ // Decode the type-specific payload; only known hint types have their
+ // fields dumped, other types are decoded and not dumped further.
+ bufferlist hint;
+ i.decode_bl(hint);
+ bufferlist::iterator hiter = hint.begin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ // Payload: pg_num (uint32_t) followed by the expected number of
+ // objects (uint64_t).
+ uint32_t pg_num;
+ uint64_t num_objs;
+ ::decode(pg_num, hiter);
+ ::decode(num_objs, hiter);
+ f->dump_unsigned("pg_num", pg_num);
+ f->dump_unsigned("expected_num_objects", num_objs);
+ }
}
break;
PG::RecoveryCtx rctx = create_context();
switch (result) {
case RES_NONE: {
+ const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
+ coll_t cid(pgid);
+
// ok, create the pg locally using provided Info and History
- rctx.transaction->create_collection(coll_t(pgid));
+ rctx.transaction->create_collection(cid);
+
+ // Give a hint to the PG collection.
+ // The hint is advisory, so it is skipped (rather than crashing) when the
+ // pool pointer is NULL or the pool reports zero PGs; the latter would
+ // otherwise divide by zero when computing the per-PG expectation.
+ bufferlist hint;
+ uint32_t pg_num = pp ? pp->get_pg_num() : 0;
+ uint64_t expected_num_objects_pg =
+   pg_num > 0 ? pp->expected_num_objects / pg_num : 0;
+ uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
+ if (pg_num > 0) {
+   // Payload: pg_num followed by the expected number of objects per PG
+   ::encode(pg_num, hint);
+   ::encode(expected_num_objects_pg, hint);
+   rctx.transaction->collection_hint(cid, hint_type, hint);
+ }
+
PG *pg = _create_lock_pg(
get_map(epoch),
pgid, create, false, result == RES_SELF,
PG *pg = NULL;
if (can_create_pg(pgid)) {
+ const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
pg_interval_map_t pi;
- rctx.transaction->create_collection(coll_t(pgid));
+ coll_t cid(pgid);
+ rctx.transaction->create_collection(cid);
+
+ // Give a hint to the PG collection.
+ // The hint is advisory, so it is skipped (rather than crashing) when the
+ // pool pointer is NULL or the pool reports zero PGs; the latter would
+ // otherwise divide by zero when computing the per-PG expectation.
+ bufferlist hint;
+ uint32_t pg_num = pp ? pp->get_pg_num() : 0;
+ uint64_t expected_num_objects_pg =
+   pg_num > 0 ? pp->expected_num_objects / pg_num : 0;
+ uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
+ if (pg_num > 0) {
+   // Payload: pg_num followed by the expected number of objects per PG
+   ::encode(pg_num, hint);
+   ::encode(expected_num_objects_pg, hint);
+   rctx.transaction->collection_hint(cid, hint_type, hint);
+ }
+
pg = _create_lock_pg(
osdmap, pgid, true, false, false,
0, creating_pgs[pgid].acting, whoami,