From 003ffe169f73e5c6c85d35100d6eec4ad5a9c513 Mon Sep 17 00:00:00 2001 From: Marc Schoechlin Date: Sat, 25 May 2019 13:05:16 +0200 Subject: [PATCH] tools/histogram_dump: improve io-latency and -size statistics The script is part of the ceph distribution but not part of the official tooling. Nevertheless it was useful to understand the usage scenarios of my osds. - add colors to the output to ease interpretation - add possibility to specify one or more sockets - aggregate statistics over multiple sockets - prevent stacktrace if tool is interrupted by SIGINT - add possibility to specify the number of data collection seconds between loops - raise loop_seconds default Signed-off-by: Marc Schoechlin --- src/tools/histogram_dump.py | 128 ++++++++++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 29 deletions(-) diff --git a/src/tools/histogram_dump.py b/src/tools/histogram_dump.py index bafc24b00ea..cc22fef5e96 100755 --- a/src/tools/histogram_dump.py +++ b/src/tools/histogram_dump.py @@ -1,9 +1,10 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 # # Ceph - scalable distributed file system # # Copyright (C) 2017 OVH +# Copyright (C) 2020 Marc Schöchlin # # This is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public @@ -16,6 +17,10 @@ import subprocess import time import os import argparse +import glob +import sys +import textwrap +import datetime def shorten(val): @@ -27,20 +32,30 @@ def shorten(val): return val -def print_histogram(asok, logger, counter, last): +def create_histogram(sockets, counter, last, seconds, batch): + + current_datasets = {} + json_d = {} + for socket in sockets: + try: + out = subprocess.check_output( + "ceph --admin-daemon {} perf histogram dump".format(socket), + shell=True) + json_d = json.loads(out.decode('utf-8')) + except Exception as e: + return (last, + "Couldn't connect to admin socket, result: \n{}".format(e)) + current_datasets[socket] = 
json_d['osd'][counter]['values'] - try: - out = subprocess.check_output( - "ceph --admin-daemon {} perf histogram dump".format(asok), - shell=True) - j = json.loads(out.decode('utf-8')) - except Exception as e: - return (last, - "Couldn't connect to admin socket, result: \n{}".format(e)) - - current = j['osd'][counter]['values'] - axes = j['osd'][counter]['axes'] - content = "" + + axes = json_d['osd'][counter]['axes'] + + if batch: + content = "{} : Counter: {} for {}\n\n\n".format( + datetime.datetime.now().isoformat(), counter,", ".join(sockets)) + else: + content = "Counter: {} for {}\n(create statistics every {} seconds)\n\n".format( + counter,", ".join(sockets),seconds) content += "{}:\n".format(axes[1]['name']) for r in axes[1]['ranges']: @@ -55,13 +70,38 @@ def print_histogram(asok, logger, counter, last): content += ("{0: >"+str(len(axes[1]['ranges'])*5+14)+"}:\n").format( axes[0]['name']) + if batch: + COL = '' + ENDC = '' + else: + COL = '\033[91m' + ENDC = '\033[0m' + + current = [] + + # initalize with zeros + for i in range(len(current_datasets[socket])): + current.append([]) + for j in range(len(current_datasets[socket][i])): + current[i].append(0) + + # combine data + for socket, data in current_datasets.items(): + for i in range(len(data)): + for j in range(len(data[i])): + current[i][j] += data[i][j] + for i in range(len(current)): for j in range(len(current[i])): try: diff = current[i][j] - last[i][j] except IndexError: diff = '-' - content += "{0: >4} ".format(shorten(diff)) + + if diff != "-" and diff != 0: + content += "{0}{1: >4}{2} ".format(COL,shorten(diff),ENDC) + else: + content += "{0: >4} ".format(shorten(diff)) r = axes[0]['ranges'][i] content += "{0: >6} : {1}\n".format( @@ -70,35 +110,65 @@ def print_histogram(asok, logger, counter, last): return (current, content) -def loop_print(asok, logger, counter): +def loop_print(sockets, counter, loop_seconds, batch): last = [] - while True: - last, content = print_histogram(asok, logger, 
counter, last) - print("{}{}".format("\n"*100, content)) - time.sleep(1) + try: + while True: + last, content = create_histogram(sockets, counter, last, loop_seconds, batch) + if not batch: + print(chr(27) + "[2J") + print(content) + time.sleep(loop_seconds) + except KeyboardInterrupt: + print("...interupted") + sys.exit(0) def main(): parser = argparse.ArgumentParser( - description='Continuously display ceph performance histogram') + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='Continuously display ceph performance histogram for selected osd operations') parser.add_argument( '--asok', type=str, - default='/var/run/ceph/*.asok', - help='Path to asok file, can use wildcards') - parser.add_argument( - '--logger', - type=str, - default='osd') + default=['/var/run/ceph/*.asok'], + nargs='+', + help='Path to asok file, you can use wildcards') parser.add_argument( '--counter', type=str, + help=textwrap.dedent('''\ + Specify name of the counter to calculate statistics + see "ceph --admin-daemon /var/run/ceph/.asok perf histogram dump" + '''), default='op_w_latency_in_bytes_histogram') - args = parser.parse_args() + parser.add_argument( + '--batch', + help='Disable colors and add timestamps', + action='store_true', + ) + parser.add_argument( + '--loop_seconds', + type=int, + help='Cycle time in seconds for statistics generation', + default=5) - loop_print(args.asok, args.logger, args.counter) + args = parser.parse_args() + if not sys.stdout.isatty(): + print("Not running with a tty, automatically switching to batch mode") + args.batch = True + + sockets = [] + for asok in args.asok: + sockets = glob.glob(asok) + sockets + + if len(sockets) == 0: + print("no suitable socket at {}".format(args.asok)) + sys.exit(1) + + loop_print(sockets, args.counter, args.loop_seconds, args.batch) if __name__ == '__main__': main() -- 2.39.5