From: Samuel Just Date: Thu, 4 Jun 2015 18:25:12 +0000 (-0700) Subject: Merge pull request #4717 from athanatos/wip-8635 X-Git-Tag: v9.0.2~48 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b7abc398c03fa20a95e3081a2c09f11abc2cadf4;p=ceph.git Merge pull request #4717 from athanatos/wip-8635 Wip 8635 -- Move scrub, snap_trim into a unified queue Reviewed-by: Kefu Chai Reviewed-by: Sage Weil --- b7abc398c03fa20a95e3081a2c09f11abc2cadf4 diff --cc src/common/config_opts.h index 8c5a2bf291d0,998e68c61b5c..d762a2c9de36 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@@ -708,23 -702,47 +703,20 @@@ OPTION(kinetic_hmac_key, OPT_STR, "asdf OPTION(kinetic_use_ssl, OPT_BOOL, false) // whether to secure kinetic traffic with TLS -//in memory write buffer configuration -OPTION(rocksdb_write_buffer_size, OPT_U64, 8*1024*1024) // rocksdb write buffer size, should be larger than average write size. -OPTION(rocksdb_write_buffer_num, OPT_INT, 2) // The maximum number of write buffers that are built up in memory. -OPTION(rocksdb_min_write_buffer_number_to_merge, OPT_INT, 1) // The min write buffers that will be merged together before writing to storage. -//on disk level0 configuration -OPTION(rocksdb_level0_file_num_compaction_trigger, OPT_INT, 4) // Number of files to trigger level-0 compaction -OPTION(rocksdb_level0_slowdown_writes_trigger, OPT_INT, -1) // number of level-0 files at which we start slowing down write. -1 means not set. -OPTION(rocksdb_level0_stop_writes_trigger, OPT_INT, -1) // number of level-0 files at which we stop writes. -1 means not set. -//on disk level1+ configuration -OPTION(rocksdb_max_bytes_for_level_base, OPT_U64, 10*1024*1024) // max total bytes for level 1 -OPTION(rocksdb_max_bytes_for_level_multiplier, OPT_INT, 10) // max total bytes for level 1 -OPTION(rocksdb_target_file_size_base, OPT_U64, 2*1024*1024) // target file size for level 1 -OPTION(rocksdb_target_file_size_multiplier, OPT_INT, 1) // target file size for Level-N = (multiplier)^(N-1) * file_size_base -OPTION(rocksdb_num_levels, OPT_INT, 7) // number of levels for this database,chang -OPTION(rocksdb_cache_size, OPT_U64, 0) // rocksdb cache size -OPTION(rocksdb_block_size, OPT_U64, 4*1024) // rocksdb block size -OPTION(rocksdb_bloom_bits_per_key, OPT_INT, 10) // rocksdb bloom bits per entry -//concurrency of compaction and flush -OPTION(rocksdb_max_background_compactions, OPT_INT, 1) // number for background compaction jobs -OPTION(rocksdb_compaction_threads, OPT_INT, 1) // number for background compaction jobs -OPTION(rocksdb_max_background_flushes, OPT_INT, 1) // number for background flush jobs -OPTION(rocksdb_flusher_threads, OPT_INT, 1) // number for background compaction jobs -//Other -OPTION(rocksdb_max_open_files, OPT_INT, 5000) // rocksdb max open files -OPTION(rocksdb_compression, OPT_STR, "snappy") // rocksdb uses compression : none, snappy, zlib, bzip2 -OPTION(rocksdb_compact_on_mount, OPT_BOOL, false) -OPTION(rocksdb_paranoid, OPT_BOOL, false) // RocksDB will aggressively check consistency of the data. -OPTION(rocksdb_log, OPT_STR, "/dev/null") // enable rocksdb log file -OPTION(rocksdb_info_log_level, OPT_STR, "info") // info log level : debug , info , warn, error, fatal -OPTION(rocksdb_wal_dir, OPT_STR, "") // rocksdb write ahead log file, put it to fast device will benifit wrtie performance -OPTION(rocksdb_disableDataSync, OPT_BOOL, false) // if true, data files are not synced to stable storage -OPTION(rocksdb_disableWAL, OPT_BOOL, false) // if true, writes will not first go to the write ahead log - +// rocksdb options that will be used for keyvaluestore(if backend is rocksdb) +OPTION(keyvaluestore_rocksdb_options, OPT_STR, "") +// rocksdb options that will be used for omap(if omap_backend is rocksdb) +OPTION(filestore_rocksdb_options, OPT_STR, "") +// rocksdb options that will be used in monstore +OPTION(mon_rocksdb_options, OPT_STR, "") /** - * osd_client_op_priority and osd_recovery_op_priority adjust the relative - * priority of client io vs recovery io. + * osd_*_priority adjust the relative priority of client io, recovery io, + * snaptrim io, etc * - * osd_client_op_priority/osd_recovery_op_priority determines the ratio of - * available io between client and recovery. Each option may be set between + * osd_*_priority determines the ratio of available io between client and + * recovery. Each option may be set between * 1..63. - * - * osd_recovery_op_warn_multiple scales the normal warning threshhold, - * osd_op_complaint_time, so that slow recovery ops won't cause noise */ OPTION(osd_client_op_priority, OPT_U32, 63) OPTION(osd_recovery_op_priority, OPT_U32, 10) diff --cc src/osd/OSD.h index 2cf6819f596b,df968594edf6..85452057ca38 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@@ -2092,150 -2175,9 +2189,9 @@@ protected // -- scrubbing -- void sched_scrub(); bool scrub_random_backoff(); - bool scrub_should_schedule(); + bool scrub_load_below_threshold(); bool scrub_time_permit(utime_t now); - xlist scrub_queue; - - struct ScrubWQ : public ThreadPool::WorkQueue { - OSD *osd; - ScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp) - : ThreadPool::WorkQueue("OSD::ScrubWQ", ti, si, tp), osd(o) {} - - bool _empty() { - return osd->scrub_queue.empty(); - } - bool _enqueue(PG *pg) { - if (pg->scrub_item.is_on_list()) { - return false; - } - pg->get("ScrubWQ"); - osd->scrub_queue.push_back(&pg->scrub_item); - return true; - } - void _dequeue(PG *pg) { - if (pg->scrub_item.remove_myself()) { - pg->put("ScrubWQ"); - } - } - PG *_dequeue() { - if (osd->scrub_queue.empty()) - return NULL; - PG *pg = osd->scrub_queue.front(); - osd->scrub_queue.pop_front(); - return pg; - } - void _process( - PG *pg, - ThreadPool::TPHandle &handle) { - pg->scrub(handle); - pg->put("ScrubWQ"); - } - void _clear() { - while (!osd->scrub_queue.empty()) { - PG *pg = osd->scrub_queue.front(); - osd->scrub_queue.pop_front(); - pg->put("ScrubWQ"); - } - } - } scrub_wq; - - struct RepScrubWQ : public ThreadPool::WorkQueue { - private: - OSD *osd; - list rep_scrub_queue; - - public: - RepScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp) - : ThreadPool::WorkQueue("OSD::RepScrubWQ", ti, si, tp), osd(o) {} - - bool _empty() { - return rep_scrub_queue.empty(); - } - bool _enqueue(MOSDRepScrub *msg) { - rep_scrub_queue.push_back(msg); - return true; - } - void _dequeue(MOSDRepScrub *msg) { - assert(0); // Not applicable for this wq - return; - } - MOSDRepScrub *_dequeue() { - if (rep_scrub_queue.empty()) - return NULL; - MOSDRepScrub *msg = rep_scrub_queue.front(); - rep_scrub_queue.pop_front(); - return msg; - } - void _process( - MOSDRepScrub *msg, - ThreadPool::TPHandle &handle) { - PG *pg = NULL; - { - Mutex::Locker lock(osd->osd_lock); - if (osd->is_stopping() || - !osd->_have_pg(msg->pgid)) { - msg->put(); - return; - } - pg = osd->_lookup_lock_pg(msg->pgid); - } - assert(pg); - pg->replica_scrub(msg, handle); - msg->put(); - pg->unlock(); - } - void _clear() { - while (!rep_scrub_queue.empty()) { - MOSDRepScrub *msg = rep_scrub_queue.front(); - rep_scrub_queue.pop_front(); - msg->put(); - } - } - } rep_scrub_wq; - // -- removing -- struct RemoveWQ : public ThreadPool::WorkQueueVal > {