Add a "lock_latency" perf counter to the per-zone data sync perf counters.
This tracks the latency of RADOS lock/unlock operations in
RGWContinuousLeaseCR, giving operators visibility into the values
driving the LatencyConcurrencyControl.
The new perf counter can be queried via the admin socket:
ceph daemon <asok> perf dump data-sync-from-<zone>
and reset independently:
ceph daemon <asok> perf reset data-sync-from-<zone>
This allows operators to distinguish a poisoned average from ongoing
OSD latency issues without restarting the RGW process.
Signed-off-by: Oguzhan Ozmen <oozmen@bloomberg.net>
current_time = ceph::coarse_mono_clock::now();
yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval));
if (latency) {
- latency->add_latency(ceph::coarse_mono_clock::now() - current_time);
+ auto elapsed = ceph::coarse_mono_clock::now() - current_time;
+ latency->add_latency(elapsed);
+ if (counters) {
+ counters->tinc(sync_counters::l_lock, elapsed);
+ }
}
current_time = ceph::coarse_mono_clock::now();
if (current_time - last_renew_try_time > interval_tolerance) {
current_time = ceph::coarse_mono_clock::now();
yield call(new RGWSimpleRadosUnlockCR(async_rados, store, obj, lock_name, cookie));
if (latency) {
- latency->add_latency(ceph::coarse_mono_clock::now() - current_time);
+ auto elapsed = ceph::coarse_mono_clock::now() - current_time;
+ latency->add_latency(elapsed);
+ if (counters) {
+ counters->tinc(sync_counters::l_lock, elapsed);
+ }
}
return set_state(RGWCoroutine_Done);
}
#include "services/svc_sys_obj.h"
#include "services/svc_bucket.h"
+#include "include/common_fwd.h"
struct rgw_http_param_pair;
class RGWRESTConn;
ceph::coarse_mono_time current_time;
LatencyMonitor* latency;
+ PerfCounters* counters;
public:
RGWContinuousLeaseCR(RGWAsyncRadosProcessor* async_rados,
rgw::sal::RadosStore* _store,
rgw_raw_obj obj, std::string lock_name,
int interval, RGWCoroutine* caller,
- LatencyMonitor* const latency)
+ LatencyMonitor* const latency,
+ PerfCounters* counters = nullptr)
: RGWCoroutine(_store->ctx()), async_rados(async_rados), store(_store),
obj(std::move(obj)), lock_name(std::move(lock_name)),
interval(interval), interval_tolerance(ceph::make_timespan(9*interval/10)),
- ts_interval(ceph::make_timespan(interval)), caller(caller), latency(latency)
+ ts_interval(ceph::make_timespan(interval)), caller(caller), latency(latency),
+ counters(counters)
{}
virtual ~RGWContinuousLeaseCR() override;
sc->env->async_rados, sc->env->driver,
{ sc->env->svc->zone->get_zone_params().log_pool,
RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) },
- string(lock_name), lock_duration, caller, &sc->lcc);
+ string(lock_name), lock_duration, caller, &sc->lcc, sc->env->counters);
}
int operate(const DoutPrefixProvider *dpp) override {
lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, driver,
rgw_raw_obj(pool, status_oid),
lock_name, lock_duration, this,
- &sc->lcc));
+ &sc->lcc, sync_env->counters));
lease_stack.reset(spawn(lease_cr.get(), false));
}
};
if (!no_lease && !bucket_lease_cr) {
bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
- lock_name, lock_duration, this, &sc->lcc));
+ lock_name, lock_duration, this, &sc->lcc, env->counters));
yield spawn(bucket_lease_cr.get(), false);
while (!bucket_lease_cr->is_locked()) {
if (bucket_lease_cr->is_done()) {
// so the command is never blocked by a background sync process holding the lock.
if (!no_lease && !bucket_lease_cr) {
bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
- lock_name, lock_duration, this, &sc->lcc));
+ lock_name, lock_duration, this, &sc->lcc, env->counters));
yield spawn(bucket_lease_cr.get(), false);
while (!bucket_lease_cr->is_locked()) {
if (bucket_lease_cr->is_done()) {
b.add_time_avg(l_poll, "poll_latency", "Average latency of replication log requests");
b.add_u64_counter(l_poll_err, "poll_errors", "Number of replication log request errors");
+ b.add_time_avg(l_lock, "lock_latency", "Average latency of sync lock operations");
+
auto logger = PerfCountersRef{ b.create_perf_counters(), cct };
cct->get_perfcounters_collection()->add(logger.get());
return logger;
l_poll,
l_poll_err,
+ l_lock,
+
l_last,
};