From ebe7b2e3703ed7be3fef42bbbe5c629cdbfc8442 Mon Sep 17 00:00:00 2001
From: "Andres D. Molins" <amolinsdiaz@yahoo.es>
Date: Mon, 31 Jul 2023 16:03:09 +0200
Subject: [PATCH] Fix: Ensure that one VM's failed execution doesn't stop the
 allocation process.

---
 firecracker/microvm.py |  2 +-
 vm_supervisor/run.py   | 45 ++++++++++++++++++++++++++----------------
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/firecracker/microvm.py b/firecracker/microvm.py
index da3299e89..da0088453 100644
--- a/firecracker/microvm.py
+++ b/firecracker/microvm.py
@@ -469,7 +469,7 @@ async def teardown(self):
             await asyncio.sleep(1)
             root_fs = self.mounted_rootfs.name
             system(f"dmsetup remove {root_fs}")
-            if self.use_jailer:
+            if self.use_jailer and Path(self.jailer_path).is_dir():
                 shutil.rmtree(self.jailer_path)
 
         if self._unix_socket:
diff --git a/vm_supervisor/run.py b/vm_supervisor/run.py
index 45a16b349..209f29c9a 100644
--- a/vm_supervisor/run.py
+++ b/vm_supervisor/run.py
@@ -49,6 +49,7 @@ async def build_event_scope(event) -> Dict[str, Any]:
 
 
 async def create_vm_execution(vm_hash: ItemHash) -> VmExecution:
+    execution: Optional[VmExecution] = None
     message, original_message = await load_updated_message(vm_hash)
     pool.message_cache[vm_hash] = message
 
@@ -79,11 +80,8 @@ async def create_vm_execution(vm_hash: ItemHash) -> VmExecution:
     except HostNotFoundError as error:
         logger.exception(error)
         pool.forget_vm(vm_hash=vm_hash)
-        raise HTTPInternalServerError(
-            reason="Error during vm initialisation, vm ping without response"
-        )
 
-    if not execution.vm:
+    if not execution or not execution.vm:  # raise when either is missing
         raise ValueError("The VM has not been created")
 
     return execution
@@ -234,21 +232,28 @@ async def run_code_on_event(vm_hash: ItemHash, event, pubsub: PubSub):
             await execution.stop()
 
 
-async def start_persistent_vm(vm_hash: ItemHash, pubsub: PubSub) -> VmExecution:
+async def start_persistent_vm(
+    vm_hash: ItemHash, pubsub: PubSub
+) -> Optional[VmExecution]:
     execution: Optional[VmExecution] = await pool.get_running_vm(vm_hash=vm_hash)
 
-    if not execution:
-        logger.info(f"Starting persistent virtual machine with id: {vm_hash}")
-        execution = await create_vm_execution(vm_hash=vm_hash)
-    # If the VM was already running in lambda mode, it should not expire
-    # as long as it is also scheduled as long-running
-    execution.persistent = True
-    execution.cancel_expiration()
+    try:
+        if not execution:
+            logger.info(f"Starting persistent virtual machine with id: {vm_hash}")
+            execution = await create_vm_execution(vm_hash=vm_hash)
+        # If the VM was already running in lambda mode, it should not expire
+        # as long as it is also scheduled as long-running
+        execution.persistent = True
+        execution.cancel_expiration()
+
+        await execution.becomes_ready()
 
-    await execution.becomes_ready()
+        if settings.WATCH_FOR_UPDATES:
+            execution.start_watching_for_updates(pubsub=pubsub)
 
-    if settings.WATCH_FOR_UPDATES:
-        execution.start_watching_for_updates(pubsub=pubsub)
+    # TODO: Handle specific exceptions; for now log them and always return a 200 code
+    except Exception:
+        logger.exception("Failed to start persistent VM %s", vm_hash)
 
     return execution
 
@@ -256,6 +261,12 @@ async def start_persistent_vm(vm_hash: ItemHash, pubsub: PubSub) -> VmExecution:
 async def stop_persistent_vm(vm_hash: ItemHash) -> Optional[VmExecution]:
     logger.info(f"Stopping persistent VM {vm_hash}")
     execution = await pool.get_running_vm(vm_hash)
-    if execution:
-        await execution.stop()
+
+    try:
+        if execution:
+            await execution.stop()
+    # TODO: Handle specific exceptions; for now log them and always return a 200 code
+    except Exception:
+        logger.exception("Failed to stop persistent VM %s", vm_hash)
+
     return execution