From: Brad Hubbard Date: Thu, 15 Jun 2017 01:40:09 +0000 (+1000) Subject: osd: Log audit X-Git-Tag: v12.1.2~62^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f8acc53d820c6535037237d205d6ce2d4829fe6c;p=ceph.git osd: Log audit Review current log messages for consistency, accuracy and necessesity as part of usability initiative. First in a series. Signed-off-by: Brad Hubbard --- diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh index 14b933233fa..28db9476b1b 100755 --- a/qa/standalone/scrub/osd-scrub-repair.sh +++ b/qa/standalone/scrub/osd-scrub-repair.sh @@ -2615,7 +2615,7 @@ function TEST_periodic_scrub_replicated() { pg_scrub $pg sleep 1 set_config osd ${primary} osd_scrub_backoff_ratio $scrub_backoff_ratio - grep -q "Regular scrub request, losing deep-scrub details" $dir/osd.${primary}.log || return 1 + grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1 # deep-scrub error is no longer present rados list-inconsistent-obj $pg | jq '.' | grep -qv $objname || return 1 diff --git a/qa/suites/rados/basic/tasks/repair_test.yaml b/qa/suites/rados/basic/tasks/repair_test.yaml index 8401c1a303f..9afbc0417b0 100644 --- a/qa/suites/rados/basic/tasks/repair_test.yaml +++ b/qa/suites/rados/basic/tasks/repair_test.yaml @@ -16,7 +16,7 @@ overrides: - scrub [0-9]+ errors - 'size 1 != size' - attr name mismatch - - Regular scrub request, losing deep-scrub details + - Regular scrub request, deep-scrub details will be lost - overall HEALTH_ - (OSDMAP_FLAGS) - (OSD_ diff --git a/qa/tasks/repair_test.py b/qa/tasks/repair_test.py index 26851e25e3d..473af088b83 100644 --- a/qa/tasks/repair_test.py +++ b/qa/tasks/repair_test.py @@ -275,7 +275,7 @@ def task(ctx, config): - 'scrub [0-9]+ errors' - 'size 1 != size' - 'attr name mismatch' - - 'Regular scrub request, losing deep-scrub details' + - 'Regular scrub request, deep-scrub details will be lost' conf: osd: filestore debug inject read err: true diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index e03768183d4..fccb834ef54 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -996,7 +996,8 @@ void ECBackend::handle_sub_read( hinfo = get_hash_info(i->first); if (!hinfo) { r = -EIO; - get_parent()->clog_error() << __func__ << ": No hinfo for " << i->first; + get_parent()->clog_error() << "Corruption detected: object " << i->first + << " is missing hash_info"; dout(5) << __func__ << ": No hinfo for " << i->first << dendl; goto error; } @@ -1010,9 +1011,8 @@ void ECBackend::handle_sub_read( j->get<1>(), bl, j->get<2>()); if (r < 0) { - get_parent()->clog_error() << __func__ - << ": Error " << r - << " reading " + get_parent()->clog_error() << "Error " << r + << " reading object " << i->first; dout(5) << __func__ << ": Error " << r << " reading " << i->first << dendl; @@ -1038,7 +1038,7 @@ void ECBackend::handle_sub_read( bufferhash h(-1); h << bl; if (h.digest() != hinfo->get_chunk_hash(shard)) { - get_parent()->clog_error() << __func__ << ": Bad hash for " << i->first << " digest 0x" + get_parent()->clog_error() << "Bad hash for " << i->first << " digest 0x" << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec; dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x" << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl; @@ -1235,7 +1235,7 @@ void ECBackend::handle_sub_read_reply( err = rop.complete[iter->first].errors.begin()->second; rop.complete[iter->first].r = err; } else { - get_parent()->clog_error() << __func__ << ": Error(s) ignored for " + get_parent()->clog_warn() << "Error(s) ignored for " << iter->first << " enough copies available"; dout(10) << __func__ << " Error(s) ignored for " << iter->first << " enough copies available" << dendl; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index b29f9b87a4c..af7914749ff 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -820,11 +820,11 @@ void OSDService::check_full_status(float ratio) dout(10) << __func__ << " " << get_full_state_name(cur_state) << " -> " << get_full_state_name(new_state) << dendl; if (new_state == FAILSAFE) { - clog->error() << "failsafe engaged, dropping updates, now " + clog->error() << "full status failsafe engaged, dropping updates, now " << (int)roundf(ratio * 100) << "% full"; } else if (cur_state == FAILSAFE) { - clog->error() << "failsafe disengaged, no longer dropping updates, now " - << (int)roundf(ratio * 100) << "% full"; + clog->error() << "full status failsafe disengaged, no longer dropping " + << "updates, now " << (int)roundf(ratio * 100) << "% full"; } cur_state = new_state; } @@ -7819,7 +7819,7 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m) } else if (!osdmap->get_hb_back_addr(whoami).probably_equals( hb_back_server_messenger->get_myaddr())) { clog->error() << "map e" << osdmap->get_epoch() - << " had wrong hb back addr (" + << " had wrong heartbeat back addr (" << osdmap->get_hb_back_addr(whoami) << " != my " << hb_back_server_messenger->get_myaddr() << ")"; @@ -7827,7 +7827,7 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m) !osdmap->get_hb_front_addr(whoami).probably_equals( hb_front_server_messenger->get_myaddr())) { clog->error() << "map e" << osdmap->get_epoch() - << " had wrong hb front addr (" + << " had wrong heartbeat front addr (" << osdmap->get_hb_front_addr(whoami) << " != my " << hb_front_server_messenger->get_myaddr() << ")"; diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 5153d5d8f32..5b620762122 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -3340,8 +3340,8 @@ void PG::log_weirdness() // sloppy check if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail())) osd->clog->error() << info.pgid - << " log bound mismatch, info (" << pg_log.get_tail() << "," - << pg_log.get_head() << "]" + << " log bound mismatch, info (tail,head] (" + << pg_log.get_tail() << "," << pg_log.get_head() << "]" << " actual [" << pg_log.get_log().log.begin()->version << "," << pg_log.get_log().log.rbegin()->version << "]"; @@ -3623,7 +3623,7 @@ bool PG::sched_scrub() } else { osd->clog->error() << "osd." << osd->whoami << " pg " << info.pgid - << " Regular scrub request, losing deep-scrub details"; + << " Regular scrub request, deep-scrub details will be lost"; } } queue_scrub(); @@ -7129,9 +7129,12 @@ boost::statechart::result PG::RecoveryState::Active::react(const ActMap&) pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) { if (pg->cct->_conf->osd_auto_mark_unfound_lost) { pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound - << " objects unfound and apparently lost, would automatically marking lost but NOT IMPLEMENTED"; + << " objects unfound and apparently lost, would automatically " + << "mark these objects lost but this feature is not yet implemented " + << "(osd_auto_mark_unfound_lost)"; } else - pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound << " objects unfound and apparently lost"; + pg->osd->clog->error() << pg->info.pgid.pgid << " has " + << unfound << " objects unfound and apparently lost"; } if (pg->is_active()) { diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 90275c66425..f244f4c2f70 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -276,6 +276,7 @@ typedef ceph::shared_ptr OSDMapRef; virtual ceph_tid_t get_tid() = 0; virtual LogClientTemp clog_error() = 0; + virtual LogClientTemp clog_warn() = 0; virtual bool check_failsafe_full(ostream &ss) = 0; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 7c22976ac36..0d21cb91a54 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -3712,22 +3712,22 @@ int PrimaryLogPG::trim_object( } else { auto p = snapset.clone_snaps.find(coid.snap); if (p == snapset.clone_snaps.end()) { - osd->clog->error() << __func__ << " No clone_snaps in snapset " << snapset - << " for " << coid << "\n"; + osd->clog->error() << "No clone_snaps in snapset " << snapset + << " for object " << coid << "\n"; return -ENOENT; } old_snaps.insert(snapset.clone_snaps[coid.snap].begin(), snapset.clone_snaps[coid.snap].end()); } if (old_snaps.empty()) { - osd->clog->error() << __func__ << " No object info snaps for " << coid; + osd->clog->error() << "No object info snaps for object " << coid; return -ENOENT; } dout(10) << coid << " old_snaps " << old_snaps << " old snapset " << snapset << dendl; if (snapset.seq == 0) { - osd->clog->error() << __func__ << " No snapset.seq for " << coid; + osd->clog->error() << "No snapset.seq for object " << coid; return -ENOENT; } @@ -3744,7 +3744,7 @@ int PrimaryLogPG::trim_object( if (new_snaps.empty()) { p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap); if (p == snapset.clones.end()) { - osd->clog->error() << __func__ << " Snap " << coid.snap << " not in clones"; + osd->clog->error() << "Snap " << coid.snap << " not in clones"; return -ENOENT; } } @@ -11379,15 +11379,16 @@ bool PrimaryLogPG::start_recovery_ops( if (missing.num_missing() > 0) { // this shouldn't happen! - osd->clog->error() << info.pgid << " recovery ending with " << missing.num_missing() - << ": " << missing.get_items(); + osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with " + << missing.num_missing() << ": " << missing.get_items(); return work_in_progress; } if (needs_recovery()) { // this shouldn't happen! // We already checked num_missing() so we must have missing replicas - osd->clog->error() << info.pgid << " recovery ending with missing replicas"; + osd->clog->error() << info.pgid + << " Unexpected Error: recovery ending with missing replicas"; return work_in_progress; } @@ -13623,7 +13624,8 @@ unsigned PrimaryLogPG::process_clones_to(const boost::optional &head, if (!allow_incomplete_clones) { next_clone.snap = **curclone; clog->error() << mode << " " << pgid << " " << head.get() - << " expected clone " << next_clone; + << " expected clone " << next_clone << " " << missing + << " missing"; ++scrubber.shallow_errors; e.set_clone_missing(next_clone.snap); } @@ -14002,7 +14004,7 @@ void PrimaryLogPG::scrub_snapshot_metadata( ObjectContextRef obc = get_object_context(p->first, false); if (!obc) { osd->clog->error() << info.pgid << " " << mode - << " cannot get object context for " + << " cannot get object context for object " << p->first; continue; } else if (obc->obs.oi.soid != p->first) { @@ -14059,7 +14061,7 @@ void PrimaryLogPG::scrub_snapshot_metadata( ObjectContextRef obc = get_object_context(p.first, true); if (!obc) { osd->clog->error() << info.pgid << " " << mode - << " cannot get object context for " + << " cannot get object context for object " << p.first; continue; } else if (obc->obs.oi.soid != p.first) { diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index c914ba2c76c..328fb1ba236 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -460,6 +460,7 @@ public: ceph_tid_t get_tid() override { return osd->get_tid(); } LogClientTemp clog_error() override { return osd->clog->error(); } + LogClientTemp clog_warn() override { return osd->clog->warn(); } struct watch_disconnect_t { uint64_t cookie;