From ebe7b2e3703ed7be3fef42bbbe5c629cdbfc8442 Mon Sep 17 00:00:00 2001
From: "Andres D. Molins"
Date: Mon, 31 Jul 2023 16:03:09 +0200
Subject: [PATCH] Fix: Ensure that one VM's failed execution doesn't stop the allocation process.

- firecracker/microvm.py: teardown() only removes the jailer directory when
  it actually exists.
- vm_supervisor/run.py: create_vm_execution() no longer raises
  HTTPInternalServerError on HostNotFoundError; it raises ValueError when no
  execution or no VM was created.
- vm_supervisor/run.py: start_persistent_vm() and stop_persistent_vm() log
  and swallow errors (best effort) so that a single failing VM does not
  abort the allocation process. Cancellation is not swallowed.

---
NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
The leading whitespace of context lines is a best guess; verify it against
the target tree (or apply with `git apply --ignore-whitespace`).

 firecracker/microvm.py |  2 +-
 vm_supervisor/run.py   | 45 ++++++++++++++++++++++++++----------------
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/firecracker/microvm.py b/firecracker/microvm.py
index da3299e89..da0088453 100644
--- a/firecracker/microvm.py
+++ b/firecracker/microvm.py
@@ -469,7 +469,7 @@ async def teardown(self):
             await asyncio.sleep(1)
             root_fs = self.mounted_rootfs.name
             system(f"dmsetup remove {root_fs}")
-            if self.use_jailer:
+            if self.use_jailer and Path(self.jailer_path).is_dir():
                 shutil.rmtree(self.jailer_path)
 
         if self._unix_socket:
diff --git a/vm_supervisor/run.py b/vm_supervisor/run.py
index 45a16b349..209f29c9a 100644
--- a/vm_supervisor/run.py
+++ b/vm_supervisor/run.py
@@ -49,6 +49,7 @@ async def build_event_scope(event) -> Dict[str, Any]:
 
 
 async def create_vm_execution(vm_hash: ItemHash) -> VmExecution:
+    execution: Optional[VmExecution] = None
     message, original_message = await load_updated_message(vm_hash)
     pool.message_cache[vm_hash] = message
 
@@ -79,11 +80,8 @@ async def create_vm_execution(vm_hash: ItemHash) -> VmExecution:
     except HostNotFoundError as error:
         logger.exception(error)
         pool.forget_vm(vm_hash=vm_hash)
-        raise HTTPInternalServerError(
-            reason="Error during vm initialisation, vm ping without response"
-        )
 
-    if not execution.vm:
+    if not execution or not execution.vm:
         raise ValueError("The VM has not been created")
 
     return execution
@@ -234,21 +232,28 @@ async def run_code_on_event(vm_hash: ItemHash, event, pubsub: PubSub):
             await execution.stop()
 
 
-async def start_persistent_vm(vm_hash: ItemHash, pubsub: PubSub) -> VmExecution:
+async def start_persistent_vm(
+    vm_hash: ItemHash, pubsub: PubSub
+) -> Optional[VmExecution]:
     execution: Optional[VmExecution] = await pool.get_running_vm(vm_hash=vm_hash)
 
-    if not execution:
-        logger.info(f"Starting persistent virtual machine with id: {vm_hash}")
-        execution = await create_vm_execution(vm_hash=vm_hash)
-    # If the VM was already running in lambda mode, it should not expire
-    # as long as it is also scheduled as long-running
-    execution.persistent = True
-    execution.cancel_expiration()
+    try:
+        if not execution:
+            logger.info(f"Starting persistent virtual machine with id: {vm_hash}")
+            execution = await create_vm_execution(vm_hash=vm_hash)
+        # If the VM was already running in lambda mode, it should not expire
+        # as long as it is also scheduled as long-running
+        execution.persistent = True
+        execution.cancel_expiration()
+
+        await execution.becomes_ready()
 
-    await execution.becomes_ready()
+        if settings.WATCH_FOR_UPDATES:
+            execution.start_watching_for_updates(pubsub=pubsub)
 
-    if settings.WATCH_FOR_UPDATES:
-        execution.start_watching_for_updates(pubsub=pubsub)
+    # TODO: Handle specific exceptions; for now, log and always return a 200 code
+    except Exception:
+        logger.exception(f"Failed to start persistent VM {vm_hash}")
 
     return execution
 
@@ -256,6 +261,12 @@ async def start_persistent_vm(vm_hash: ItemHash, pubsub: PubSub) -> VmExecution:
 async def stop_persistent_vm(vm_hash: ItemHash) -> Optional[VmExecution]:
     logger.info(f"Stopping persistent VM {vm_hash}")
     execution = await pool.get_running_vm(vm_hash)
-    if execution:
-        await execution.stop()
+
+    try:
+        if execution:
+            await execution.stop()
+    # TODO: Handle specific exceptions; for now, log and always return a 200 code
+    except Exception:
+        logger.exception(f"Failed to stop persistent VM {vm_hash}")
+
     return execution