From 55886eac960cfc6f311fb4c9812311c420761793 Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Mon, 27 Feb 2023 17:39:25 -0700
Subject: [PATCH] dispatcher: Do not time out when locking machines

Signed-off-by: Zack Cerza
---
Review notes (this section is not part of the commit message):

* list_locks() now forwards its new `tries` argument to safe_while()
  instead of hard-coding tries=-1.  Previously the parameter was accepted
  but ignored, so every caller of list_locks() received tries=-1, not only
  the dispatcher's lock_machines(), which requests it explicitly via
  block_and_lock_machines(..., tries=-1).  Callers that pass nothing now
  get the tries=10 default declared in the signature.
* The post-image blob id on the teuthology/lock/query.py index line
  predates this one-line correction.
* Whitespace in this patch was reconstructed from a collapsed copy; verify
  the context lines against the tree before applying.

 teuthology/dispatcher/__init__.py |  9 +++++++--
 teuthology/lock/ops.py            | 11 ++++++++---
 teuthology/lock/query.py          |  8 ++++++--
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/teuthology/dispatcher/__init__.py b/teuthology/dispatcher/__init__.py
index 273f747e2..9aea132dd 100644
--- a/teuthology/dispatcher/__init__.py
+++ b/teuthology/dispatcher/__init__.py
@@ -220,8 +220,13 @@ def find_dispatcher_processes(machine_type):
 def lock_machines(job_config):
     report.try_push_job_info(job_config, dict(status='running'))
     fake_ctx = supervisor.create_fake_context(job_config, block=True)
-    block_and_lock_machines(fake_ctx, len(job_config['roles']),
-                            job_config['machine_type'], reimage=False)
+    block_and_lock_machines(
+        fake_ctx,
+        len(job_config['roles']),
+        job_config['machine_type'],
+        tries=-1,
+        reimage=False,
+    )
     job_config = fake_ctx.config
     return job_config
 
diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py
index b0c7d8033..5ab995ad7 100644
--- a/teuthology/lock/ops.py
+++ b/teuthology/lock/ops.py
@@ -327,7 +327,7 @@ def reimage_machines(ctx, machines, machine_type):
     return reimaged
 
 
-def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
+def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True, tries=10):
     # It's OK for os_type and os_version to be None here. If we're trying
     # to lock a bare metal machine, we'll take whatever is available. If
     # we want a vps, defaults will be provided by misc.get_distro and
@@ -347,8 +347,13 @@ def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True):
     requested = total_requested
     while True:
         # get a candidate list of machines
-        machines = query.list_locks(machine_type=machine_type, up=True,
-                                    locked=False, count=requested + reserved)
+        machines = query.list_locks(
+            machine_type=machine_type,
+            up=True,
+            locked=False,
+            count=requested + reserved,
+            tries=tries,
+        )
         if machines is None:
             if ctx.block:
                 log.error('Error listing machines, trying again')
diff --git a/teuthology/lock/query.py b/teuthology/lock/query.py
index bb1044c2b..9fd09d9ab 100644
--- a/teuthology/lock/query.py
+++ b/teuthology/lock/query.py
@@ -51,7 +51,7 @@ def is_vm(name=None, status=None):
     return status.get('is_vm', False)
 
 
-def list_locks(keyed_by_name=False, **kwargs):
+def list_locks(keyed_by_name=False, tries=10, **kwargs):
     uri = os.path.join(config.lock_server, 'nodes', '')
     for key, value in kwargs.items():
         if kwargs[key] is False:
@@ -63,7 +63,11 @@ def list_locks(keyed_by_name=False, **kwargs):
         kwargs['machine_type'] = kwargs['machine_type'].replace(',','|')
     uri += '?' + urlencode(kwargs)
     with safe_while(
-            sleep=1, increment=0.5, action='list_locks') as proceed:
+            sleep=1,
+            increment=0.5,
+            tries=tries,
+            action='list_locks'
+    ) as proceed:
         while proceed():
             try:
                 response = requests.get(uri)
-- 
2.47.3