From: Samuel Just <sam.just@inktank.com>
Date: Thu, 4 Jun 2015 18:25:12 +0000 (-0700)
Subject: Merge pull request #4717 from athanatos/wip-8635
X-Git-Tag: v9.0.2~48
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b7abc398c03fa20a95e3081a2c09f11abc2cadf4;p=ceph.git

Merge pull request #4717 from athanatos/wip-8635

Wip 8635 -- Move scrub, snap_trim into a unified queue

Reviewed-by: Kefu Chai <kchai@redhat.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---

b7abc398c03fa20a95e3081a2c09f11abc2cadf4
diff --cc src/common/config_opts.h
index 8c5a2bf291d0,998e68c61b5c..d762a2c9de36
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@@ -708,23 -702,47 +703,20 @@@ OPTION(kinetic_hmac_key, OPT_STR, "asdf
  OPTION(kinetic_use_ssl, OPT_BOOL, false) // whether to secure kinetic traffic with TLS
  
  
 -//in memory write buffer configuration
 -OPTION(rocksdb_write_buffer_size, OPT_U64, 8*1024*1024) // rocksdb write buffer size, should be larger than average write size.
 -OPTION(rocksdb_write_buffer_num, OPT_INT, 2) // The maximum number of write buffers that are built up in memory.
 -OPTION(rocksdb_min_write_buffer_number_to_merge, OPT_INT, 1) // The min write buffers that will be merged together before writing to storage.
 -//on disk level0 configuration
 -OPTION(rocksdb_level0_file_num_compaction_trigger, OPT_INT, 4) // Number of files to trigger level-0 compaction
 -OPTION(rocksdb_level0_slowdown_writes_trigger, OPT_INT, -1)  // number of level-0 files at which we start slowing down write. -1 means not set.
 -OPTION(rocksdb_level0_stop_writes_trigger, OPT_INT, -1)  // number of level-0 files at which we stop writes. -1 means not set.
 -//on disk level1+ configuration
 -OPTION(rocksdb_max_bytes_for_level_base, OPT_U64, 10*1024*1024)  // max total bytes for level 1
 -OPTION(rocksdb_max_bytes_for_level_multiplier, OPT_INT, 10)  // max total bytes for level 1
 -OPTION(rocksdb_target_file_size_base, OPT_U64, 2*1024*1024) // target file size for level 1
 -OPTION(rocksdb_target_file_size_multiplier, OPT_INT, 1) // target file size for Level-N = (multiplier)^(N-1) * file_size_base
 -OPTION(rocksdb_num_levels, OPT_INT, 7) // number of levels for this database,chang
 -OPTION(rocksdb_cache_size, OPT_U64, 0) // rocksdb cache size
 -OPTION(rocksdb_block_size, OPT_U64, 4*1024) // rocksdb block size
 -OPTION(rocksdb_bloom_bits_per_key, OPT_INT, 10) // rocksdb bloom bits per entry
 -//concurrency of compaction and flush
 -OPTION(rocksdb_max_background_compactions, OPT_INT, 1) // number for background compaction jobs
 -OPTION(rocksdb_compaction_threads, OPT_INT, 1) // number for background compaction jobs
 -OPTION(rocksdb_max_background_flushes, OPT_INT, 1) // number for background flush jobs
 -OPTION(rocksdb_flusher_threads, OPT_INT, 1) // number for background compaction jobs
 -//Other
 -OPTION(rocksdb_max_open_files, OPT_INT, 5000) // rocksdb max open files
 -OPTION(rocksdb_compression, OPT_STR, "snappy") // rocksdb uses compression : none, snappy, zlib, bzip2
 -OPTION(rocksdb_compact_on_mount, OPT_BOOL, false)
 -OPTION(rocksdb_paranoid, OPT_BOOL, false) // RocksDB will aggressively check consistency of the data.
 -OPTION(rocksdb_log, OPT_STR, "/dev/null")  // enable rocksdb log file
 -OPTION(rocksdb_info_log_level, OPT_STR, "info")  // info log level : debug , info , warn, error, fatal
 -OPTION(rocksdb_wal_dir, OPT_STR, "")  //  rocksdb write ahead log file, put it to fast device will benifit wrtie performance
 -OPTION(rocksdb_disableDataSync, OPT_BOOL, false) // if true, data files are not synced to stable storage
 -OPTION(rocksdb_disableWAL, OPT_BOOL, false)  // if true, writes will not first go to the write ahead log
 -
 +// rocksdb options that will be used for keyvaluestore(if backend is rocksdb)
 +OPTION(keyvaluestore_rocksdb_options, OPT_STR, "")
 +// rocksdb options that will be used for omap(if omap_backend is rocksdb)
 +OPTION(filestore_rocksdb_options, OPT_STR, "")
 +// rocksdb options that will be used in monstore
 +OPTION(mon_rocksdb_options, OPT_STR, "")
  
  /**
-  * osd_client_op_priority and osd_recovery_op_priority adjust the relative
-  * priority of client io vs recovery io.
+  * The osd_*_priority options adjust the relative priority of client io,
+  * recovery io, snaptrim io, etc.
   *
-  * osd_client_op_priority/osd_recovery_op_priority determines the ratio of
-  * available io between client and recovery.  Each option may be set between
+  * The osd_*_priority values determine the ratio of available io among
+  * those classes of work (client, recovery, snaptrim, ...).  Each option
+  * may be set between 1..63.
-  *
-  * osd_recovery_op_warn_multiple scales the normal warning threshhold,
-  * osd_op_complaint_time, so that slow recovery ops won't cause noise
   */
  OPTION(osd_client_op_priority, OPT_U32, 63)
  OPTION(osd_recovery_op_priority, OPT_U32, 10)
diff --cc src/osd/OSD.h
index 2cf6819f596b,df968594edf6..85452057ca38
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@@ -2092,150 -2175,9 +2189,9 @@@ protected
    // -- scrubbing --
    void sched_scrub();
    bool scrub_random_backoff();
 -  bool scrub_should_schedule();
 +  bool scrub_load_below_threshold();
    bool scrub_time_permit(utime_t now);
  
-   xlist<PG*> scrub_queue;
- 
-   struct ScrubWQ : public ThreadPool::WorkQueue<PG> {
-     OSD *osd;
-     ScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
-       : ThreadPool::WorkQueue<PG>("OSD::ScrubWQ", ti, si, tp), osd(o) {}
- 
-     bool _empty() {
-       return osd->scrub_queue.empty();
-     }
-     bool _enqueue(PG *pg) {
-       if (pg->scrub_item.is_on_list()) {
- 	return false;
-       }
-       pg->get("ScrubWQ");
-       osd->scrub_queue.push_back(&pg->scrub_item);
-       return true;
-     }
-     void _dequeue(PG *pg) {
-       if (pg->scrub_item.remove_myself()) {
- 	pg->put("ScrubWQ");
-       }
-     }
-     PG *_dequeue() {
-       if (osd->scrub_queue.empty())
- 	return NULL;
-       PG *pg = osd->scrub_queue.front();
-       osd->scrub_queue.pop_front();
-       return pg;
-     }
-     void _process(
-       PG *pg,
-       ThreadPool::TPHandle &handle) {
-       pg->scrub(handle);
-       pg->put("ScrubWQ");
-     }
-     void _clear() {
-       while (!osd->scrub_queue.empty()) {
- 	PG *pg = osd->scrub_queue.front();
- 	osd->scrub_queue.pop_front();
- 	pg->put("ScrubWQ");
-       }
-     }
-   } scrub_wq;
- 
-   struct RepScrubWQ : public ThreadPool::WorkQueue<MOSDRepScrub> {
-   private: 
-     OSD *osd;
-     list<MOSDRepScrub*> rep_scrub_queue;
- 
-   public:
-     RepScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
-       : ThreadPool::WorkQueue<MOSDRepScrub>("OSD::RepScrubWQ", ti, si, tp), osd(o) {}
- 
-     bool _empty() {
-       return rep_scrub_queue.empty();
-     }
-     bool _enqueue(MOSDRepScrub *msg) {
-       rep_scrub_queue.push_back(msg);
-       return true;
-     }
-     void _dequeue(MOSDRepScrub *msg) {
-       assert(0); // Not applicable for this wq
-       return;
-     }
-     MOSDRepScrub *_dequeue() {
-       if (rep_scrub_queue.empty())
- 	return NULL;
-       MOSDRepScrub *msg = rep_scrub_queue.front();
-       rep_scrub_queue.pop_front();
-       return msg;
-     }
-     void _process(
-       MOSDRepScrub *msg,
-       ThreadPool::TPHandle &handle) {
-       PG *pg = NULL;
-       {
- 	Mutex::Locker lock(osd->osd_lock);
- 	if (osd->is_stopping() ||
- 	    !osd->_have_pg(msg->pgid)) {
- 	  msg->put();
- 	  return;
- 	}
- 	pg = osd->_lookup_lock_pg(msg->pgid);
-       }
-       assert(pg);
-       pg->replica_scrub(msg, handle);
-       msg->put();
-       pg->unlock();
-     }
-     void _clear() {
-       while (!rep_scrub_queue.empty()) {
- 	MOSDRepScrub *msg = rep_scrub_queue.front();
- 	rep_scrub_queue.pop_front();
- 	msg->put();
-       }
-     }
-   } rep_scrub_wq;
- 
    // -- removing --
    struct RemoveWQ :
      public ThreadPool::WorkQueueVal<pair<PGRef, DeletingStateRef> > {