* Monitors now have a config option ``mon_osd_warn_num_repaired``, 10 by default.
If any OSD has repaired more than this many I/O errors in stored data a
- ``OSD_TOO_MANY_REPAIRS`` health warning is generated.
+ ``OSD_TOO_MANY_REPAIRS`` health warning is generated. In order to allow
+ clearing of the warning, a new command ``ceph tell osd.# clear_shards_repaired [count]``
+ has been added. By default it will set the repair count to 0. If you want
+ to be warned again when additional repairs are performed, you can provide a
+ count to the command, for example the value of ``mon_osd_warn_num_repaired``.
+ This command will be replaced in future releases by the health mute/unmute feature.
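
A minimal usage sketch of the new command (the OSD id ``osd.2`` is arbitrary and
the count ``10`` simply mirrors the default ``mon_osd_warn_num_repaired``)::

    # reset the repair count to 0, which clears OSD_TOO_MANY_REPAIRS
    ceph tell osd.2 clear_shards_repaired

    # seed the count with the warning threshold so that the very next
    # repaired read raises the warning again
    ceph tell osd.2 clear_shards_repaired 10
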
it exceeds a config value threshold *mon_osd_warn_num_repaired* (default: 10),
this health warning is generated.
+In order to allow clearing of the warning, a new command
+``ceph tell osd.# clear_shards_repaired [count]`` has been added.
+By default it will set the repair count to 0. If the administrator wants
+to re-enable the warning when any additional repairs are performed, a count
+can be provided, for example the value of ``mon_osd_warn_num_repaired``.
+This command will be replaced in future releases by the health mute/unmute feature.
+
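
The counter that drives this warning can also be inspected directly; a brief
sketch (the ``jq`` path is the same one used by the test changes below)::

    # cluster-wide number of repaired shards, summed across all OSDs
    ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired"

    # check whether the warning is currently raised
    ceph health | grep "Too many repaired reads"
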
LARGE_OMAP_OBJECTS
__________________
ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
ceph health detail | grep -q "osd.$primary had $OBJS reads repaired" || return 1
- ceph health mute OSD_TOO_MANY_REPAIRS
+ ceph tell osd.$primary clear_shards_repaired
+ sleep 10
+
set -o pipefail
- # Should mute this
+ # The warning should now be cleared
ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
set +o pipefail
+ ceph tell osd.$primary clear_shards_repaired $OBJS
+ sleep 10
+
for i in $(seq 1 $OBJS)
do
inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
test "$COUNT" = "$(expr $OBJS \* 3)" || return 1
- # Give mon a chance to notice additional OSD and unmute
+ # Give mon a chance to notice the additional OSD and the updated num_shards_repaired
# The default tick time is 5 seconds
CHECKTIME=10
LOOPS=0
return;
}
+void OSDService::set_osd_stat_repaired(int64_t count)
+{
+ std::lock_guard l(stat_lock);
+ osd_stat.num_shards_repaired = count;
+ return;
+}
+
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
uint64_t adjust_used)
{
"name=message,type=CephString,n=N",
"log a message to the cluster log",
"osd", "rw")
+COMMAND("clear_shards_repaired " \
+ "name=count,type=CephInt,req=false",
+ "clear num_shards_repaired to clear health warning",
+ "osd", "rw")
COMMAND("bench " \
"name=count,type=CephInt,req=false " \
"name=size,type=CephInt,req=false " \
}
clog->do_log(level, message);
}
+ else if (prefix == "clear_shards_repaired") {
+ int64_t count;
+ cmd_getval(cct, cmdmap, "count", count, (int64_t) 0);
+ service.set_osd_stat_repaired(count);
+ }
// either 'pg <pgid> <command>' or
// 'tell <pgid>' (which comes in without any of that prefix)?
osd_alert_list_t& alerts);
osd_stat_t set_osd_stat(vector<int>& hb_peers, int num_pgs);
void inc_osd_stat_repaired(void);
+ void set_osd_stat_repaired(int64_t count);
float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
osd_stat_t get_osd_stat() {
std::lock_guard l(stat_lock);
virtual bool pg_is_repair() = 0;
virtual void inc_osd_stat_repaired() = 0;
+ virtual void set_osd_stat_repaired(int64_t count) = 0;
virtual bool pg_is_remote_backfilling() = 0;
virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
void inc_osd_stat_repaired() override {
osd->inc_osd_stat_repaired();
}
+ void set_osd_stat_repaired(int64_t count) override {
+ osd->set_osd_stat_repaired(count);
+ }
bool pg_is_remote_backfilling() override {
return is_remote_backfilling();
}