From 0aca695f819df75a7d7064b20c6205481f9c17e1 Mon Sep 17 00:00:00 2001
From: Olivier Le Thanh Duong <olivier@lethanh.be>
Date: Mon, 28 Oct 2024 12:51:16 +0100
Subject: [PATCH] Make vm_id assignment more robust

Remove the counter way to assign a vm_id as it didn't work reliably

Jira ticket: ALEPH-272

That method was broken when persitent instances were loaded at start up.
Since the "new" feature that allow persistent instance across aleph-vm reboot
if one was started then aleph-vm was stopped and restarted the counter method
could reassign the ip and break the existing vm's.

Secundary reason was that the feature wasn't working properly with the default
settings, as `2**available_bits` returned 1. So that code path was only used if
the node owner tweaked some undocumented settings making it hard to identify and
debug in prod nodes.
---
 src/aleph/vm/pool.py | 35 ++++++++++-------------------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py
index 3ecf500e..025bfe45 100644
--- a/src/aleph/vm/pool.py
+++ b/src/aleph/vm/pool.py
@@ -28,15 +28,13 @@
 
 
 class VmPool:
-    """Pool of VMs already started and used to decrease response time.
+    """Pool of existing VMs
+
+    For function VM we keep the VM a while after they  have run, so we can reuse them  and thus decrease response time.
     After running, a VM is saved for future reuse from the same function during a
     configurable duration.
-
-    The counter is used by the VMs to set their tap interface name and the corresponding
-    IPv4 subnet.
     """
 
-    counter: int  # Used to provide distinct ids to network interfaces
     executions: dict[ItemHash, VmExecution]
     message_cache: dict[str, ExecutableMessage]
     network: Network | None
@@ -45,7 +43,6 @@ class VmPool:
     creation_lock: asyncio.Lock
 
     def __init__(self, loop: asyncio.AbstractEventLoop):
-        self.counter = settings.START_ID_INDEX
         self.executions = {}
         self.message_cache = {}
 
@@ -150,25 +147,13 @@ def get_unique_vm_id(self) -> int:
         This identifier is used to name the network interface and in the IPv4 range
         dedicated to the VM.
         """
-        _, network_range = settings.IPV4_ADDRESS_POOL.split("/")
-        available_bits = int(network_range) - settings.IPV4_NETWORK_PREFIX_LENGTH
-        self.counter += 1
-        if self.counter < 2**available_bits:
-            # In common cases, use the counter itself as the vm_id. This makes it
-            # easier to debug.
-            return self.counter
-        else:
-            # The value of the counter is too high and some functions such as the
-            # IPv4 range dedicated to the VM do not support such high values.
-            #
-            # We therefore recycle vm_id values from executions that are not running
-            # anymore.
-            currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()}
-            for i in range(settings.START_ID_INDEX, 255**2):
-                if i not in currently_used_vm_ids:
-                    return i
-            msg = "No available value for vm_id."
-            raise ValueError(msg)
+        # Take the first id that is not already taken
+        currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()}
+        for i in range(settings.START_ID_INDEX, 255**2):
+            if i not in currently_used_vm_ids:
+                return i
+        msg = "No available value for vm_id."
+        raise ValueError(msg)
 
     def get_running_vm(self, vm_hash: ItemHash) -> VmExecution | None:
         """Return a running VM or None. Disables the VM expiration task."""