From 0aca695f819df75a7d7064b20c6205481f9c17e1 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 28 Oct 2024 12:51:16 +0100 Subject: [PATCH] Make vm_id assignment more robust Remove the counter way to assign a vm_id as it didn't work reliably Jira ticket: ALEPH-272 That method was broken when persitent instances were loaded at start up. Since the "new" feature that allow persistent instance across aleph-vm reboot if one was started then aleph-vm was stopped and restarted the counter method could reassign the ip and break the existing vm's. Secundary reason was that the feature wasn't working properly with the default settings, as `2**available_bits` returned 1. So that code path was only used if the node owner tweaked some undocumented settings making it hard to identify and debug in prod nodes. --- src/aleph/vm/pool.py | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 3ecf500e..025bfe45 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -28,15 +28,13 @@ class VmPool: - """Pool of VMs already started and used to decrease response time. + """Pool of existing VMs + + For function VM we keep the VM a while after they have run, so we can reuse them and thus decrease response time. After running, a VM is saved for future reuse from the same function during a configurable duration. - - The counter is used by the VMs to set their tap interface name and the corresponding - IPv4 subnet. """ - counter: int # Used to provide distinct ids to network interfaces executions: dict[ItemHash, VmExecution] message_cache: dict[str, ExecutableMessage] network: Network | None @@ -45,7 +43,6 @@ class VmPool: creation_lock: asyncio.Lock def __init__(self, loop: asyncio.AbstractEventLoop): - self.counter = settings.START_ID_INDEX self.executions = {} self.message_cache = {} @@ -150,25 +147,13 @@ def get_unique_vm_id(self) -> int: This identifier is used to name the network interface and in the IPv4 range dedicated to the VM. """ - _, network_range = settings.IPV4_ADDRESS_POOL.split("/") - available_bits = int(network_range) - settings.IPV4_NETWORK_PREFIX_LENGTH - self.counter += 1 - if self.counter < 2**available_bits: - # In common cases, use the counter itself as the vm_id. This makes it - # easier to debug. - return self.counter - else: - # The value of the counter is too high and some functions such as the - # IPv4 range dedicated to the VM do not support such high values. - # - # We therefore recycle vm_id values from executions that are not running - # anymore. - currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()} - for i in range(settings.START_ID_INDEX, 255**2): - if i not in currently_used_vm_ids: - return i - msg = "No available value for vm_id." - raise ValueError(msg) + # Take the first id that is not already taken + currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()} + for i in range(settings.START_ID_INDEX, 255**2): + if i not in currently_used_vm_ids: + return i + msg = "No available value for vm_id." + raise ValueError(msg) def get_running_vm(self, vm_hash: ItemHash) -> VmExecution | None: """Return a running VM or None. Disables the VM expiration task."""