Skip to content

Commit

Permalink
QEmu support continuation (#490)
Browse files Browse the repository at this point in the history
continuation of #487 

Allow launching VM Instance via QEMU instead of firecracker.

This works by adding a new controller for Qemu alongside AlephFirecrackerProgram and AlephFirecrackerInstance, and launching it when the message is an Instance and its hypervisor is `qemu`.

I'm opening this so we can get the review and discussion started, but from discussion I understand it won't be for the next release since we are focusing on bugfixes for now. I can clean up the git history afterward if needed. Please play with it plenty to find any problems I might have missed.

There is a corresponding PR in aleph-message: aleph-im/aleph-message#78


## how to test
See this pretty complete readme on how to test it https://github.com/aleph-im/aleph-vm/blob/qemu_support/src/aleph/vm/controllers/qemu/QEMU.md

The necessary changes in aleph-message were released in 0.4.1.

## Modification to the code

I had to make a few changes outside the Qemu controller itself to provide compatibility between all controllers:

- New abstract class: AlephVmControllerInterface, which defines the interface shared by the Firecracker and Qemu controllers, for code sharing and typing.
- Add a `support_snapshot` field on the controllers so each controller can explicitly declare snapshot support to the snapshot manager, instead of the manager guessing it as it did until now.
- A mixin to manage the cloud-init config. I intended it to be used by all the controllers that need it, but at the moment I had to tweak the cloud-init configuration, so that is not done yet.
- `get_log_queue` and `unregister_queue`, so the operator can register to the log queues without knowing the internal logic of the VM (which is different since Qemu doesn't use MicroVM).


Refer to QEMU.md for a list of the features supported at the moment.
IMHO the main thing missing is automated testing.
moment I had to tweak the cloud init configuration so it's not done yet
    get_log_queue and unregister_queue so the operator can register to the Log queues without knowing the internal logic of the VM (which is different since Qemu don't use MicroVM

Refer to QEMU.md for a list of supported feature at the moment.
IMHO the main thing missing is automated testing.
  • Loading branch information
olethanh authored Dec 5, 2023
1 parent fd2b102 commit ef8ae0b
Show file tree
Hide file tree
Showing 24 changed files with 941 additions and 68 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/test-new-runtime-examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@ jobs:
curl --retry 5 --max-time 10 --fail "http://${DROPLET_IPV4}:4020/about/usage/system"
curl --retry 5 --max-time 10 --fail "http://${DROPLET_IPV4}:4020/status/check/fastapi"
- name: Export aleph logs
if: always()
run: |
export DROPLET_IPV4="$(doctl compute droplet get aleph-vm-ci-runtime --output json | ./.github/scripts/extract_droplet_ipv4.py)"
ssh root@${DROPLET_IPV4} "journalctl -u aleph-vm-supervisor"
- name: Cleanup
if: always()
run: |
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/test-on-droplets-matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,12 @@ jobs:
-d '{"persistent_vms": [], "instances": ["${{ matrix.check_vm.item_hash }}"]}' \
"http://${DROPLET_IPV4}:4020/control/allocations"
- name: Export aleph logs
if: always()
run: |
export DROPLET_IPV4="$(doctl compute droplet get aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }} --output json | ./.github/scripts/extract_droplet_ipv4.py)"
ssh root@${DROPLET_IPV4} "journalctl -u aleph-vm-supervisor"
- name: Cleanup
if: always()
run: |
Expand Down
2 changes: 1 addition & 1 deletion docker/vm_supervisor-dev.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ RUN curl -fsSL -o /opt/firecracker/vmlinux.bin https://s3.amazonaws.com/spec.ccf
RUN ln /opt/firecracker/release-*/firecracker-v* /opt/firecracker/firecracker
RUN ln /opt/firecracker/release-*/jailer-v* /opt/firecracker/jailer

RUN pip3 install typing-extensions 'aleph-message==0.4.0'
RUN pip3 install typing-extensions 'aleph-message==0.4.1'

RUN mkdir -p /var/lib/aleph/vm/jailer

Expand Down
66 changes: 66 additions & 0 deletions examples/qemu_message_from_aleph.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{
"chain": "ETH",
"item_hash": "fake-hash-fake-hash-fake-hash-fake-hash-fake-hash-fake-hash-hash",
"sender": "0x9319Ad3B7A8E0eE24f2E639c40D8eD124C5520Ba",
"type": "INSTANCE",
"channel": "Fun-dApps",
"confirmed": true,
"content": {
"address": "0x9319Ad3B7A8E0eE24f2E639c40D8eD124C5520Ba",
"allow_amend": false,
"variables": {
"VM_CUSTOM_NUMBER": "32"
},
"environment": {
"reproducible": true,
"internet": true,
"aleph_api": true,
"shared_cache": true,
"hypervisor": "qemu"
},
"resources": {
"vcpus": 1,
"memory": 512,
"seconds": 30
},
"rootfs": {
"parent": {
"ref": "549ec451d9b099cad112d4aaa2c00ac40fb6729a92ff252ff22eef0b5c3cb613",
"use_latest": false
},
"persistence": "host",
"size_mib": 5000
},
"authorized_keys": [
"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDj95BHGUx0/z2G/tTrEi8o49i70xvjcEUdSs3j4A33jE7pAphrfRVbuFMgFubcm8n9r5ftd/H8SjjTL4hY9YvWV5ZuMf92GUga3n4wgevvPlBszYZCy/idxFl0vtHYC1CcK9v4tVb9onhDt8FOJkf2m6PmDyvC+6tl6LwoerXTeeiKr5VnTB4KOBkammtFmix3d1X1SZd/cxdwZIHcQ7BNsqBm2w/YzVba6Z4ZnFUelBkQtMQqNs2aV51O1pFFqtZp2mM71D5d8vn9pOtqJ5QmY5IW6NypcyqKJZg5o6QguK5rdXLkc7AWro27BiaHIENl3w0wazp9EDO9zPAGJ6lz olivier@lanius"
],
"volumes": [
{
"mount": "/opt/venv",
"ref": "5f31b0706f59404fad3d0bff97ef89ddf24da4761608ea0646329362c662ba51",
"use_latest": false
},
{
"comment": "Working data persisted on the VM supervisor, not available on other nodes",
"mount": "/var/lib/example",
"name": "data",
"persistence": "host",
"size_mib": 5
}
],
"replaces": "0x9319Ad3B7A8E0eE24f2E639c40D8eD124C5520Ba",
"time": 1619017773.8950517
},
"item_content": "{\"address\":\"0x9319Ad3B7A8E0eE24f2E639c40D8eD124C5520Ba\",\"allow_amend\":false,\"variables\":{\"VM_CUSTOM_NUMBER\":\"32\"},\"environment\":{\"reproducible\":true,\"internet\":true,\"aleph_api\":true,\"shared_cache\":true},\"resources\":{\"vcpus\":1,\"memory\":128,\"seconds\":30},\"rootfs\":{\"parent\":{\"ref\":\"549ec451d9b099cad112d4aaa2c00ac40fb6729a92ff252ff22eef0b5c3cb613\",\"use_latest\":true},\"persistence\":\"host\",\"size_mib\":20000},\"cloud_config\":{\"password\":\"password\",\"chpasswd\":{\"expire\":\"False\"}},\"volumes\":[{\"mount\":\"/opt/venv\",\"ref\":\"5f31b0706f59404fad3d0bff97ef89ddf24da4761608ea0646329362c662ba51\",\"use_latest\":false},{\"comment\":\"Working data persisted on the VM supervisor, not available on other nodes\",\"mount\":\"/var/lib/example\",\"name\":\"data\",\"persistence\":\"host\",\"size_mib\":5}],\"replaces\":\"0x9319Ad3B7A8E0eE24f2E639c40D8eD124C5520Ba\",\"time\":1619017773.8950517}",
"item_type": "inline",
"signature": "0x372da8230552b8c3e65c05b31a0ff3a24666d66c575f8e11019f62579bf48c2b7fe2f0bbe907a2a5bf8050989cdaf8a59ff8a1cbcafcdef0656c54279b4aa0c71b",
"size": 749,
"time": 1619017773.8950577,
"confirmations": [
{
"chain": "ETH",
"height": 12284734,
"hash": "0x67f2f3cde5e94e70615c92629c70d22dc959a118f46e9411b29659c2fce87cdc"
}
]
}
2 changes: 1 addition & 1 deletion examples/volumes/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ RUN apt-get update && apt-get -y upgrade && apt-get install -y \
&& rm -rf /var/lib/apt/lists/*

RUN python3 -m venv /opt/venv
RUN /opt/venv/bin/pip install 'aleph-message==0.4.0'
RUN /opt/venv/bin/pip install 'aleph-message==0.4.1'

CMD mksquashfs /opt/venv /mnt/volume-venv.squashfs
2 changes: 1 addition & 1 deletion packaging/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ debian-package-code:
cp ../examples/instance_message_from_aleph.json ./aleph-vm/opt/aleph-vm/examples/instance_message_from_aleph.json
cp -r ../examples/data ./aleph-vm/opt/aleph-vm/examples/data
mkdir -p ./aleph-vm/opt/aleph-vm/examples/volumes
pip3 install --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.4.0' 'jwskate==0.8.0' 'eth-account==0.9.0' 'sentry-sdk==1.31.0'
pip3 install --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.4.1' 'jwskate==0.8.0' 'eth-account==0.9.0' 'sentry-sdk==1.31.0' 'qmp==1.1.0'
python3 -m compileall ./aleph-vm/opt/aleph-vm/

debian-package-resources: firecracker-bins vmlinux download-ipfs-kubo
Expand Down
2 changes: 1 addition & 1 deletion packaging/aleph-vm/DEBIAN/control
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ Version: 0.1.8
Architecture: all
Maintainer: Aleph.im
Description: Aleph.im VM execution engine
Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule
Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils
Section: aleph-im
Priority: Extra
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ dependencies = [
"alembic~=1.7.6",
"setproctitle~=1.3.3",
"pyyaml~=6.0.1",
"aleph-message~=0.4.0",
"aleph-message~=0.4.1",
"jwskate~=0.8.0",
"eth-account~=0.9.0",
"sentry-sdk~=1.31.0",
Expand All @@ -42,6 +42,7 @@ dependencies = [
"msgpack~=1.0.7",
"packaging~=23.2",
"jsonschema==4.19.1",
"qmp==0.0.1"
]

[project.urls]
Expand Down
14 changes: 14 additions & 0 deletions src/aleph/vm/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ class Settings(BaseSettings):
# hashlib.sha256(b"secret-token").hexdigest()
ALLOCATION_TOKEN_HASH = "151ba92f2eb90bce67e912af2f7a5c17d8654b3d29895b042107ea312a7eebda"

ENABLE_QEMU_SUPPORT: bool = Field(default=False)

# Tests on programs

FAKE_DATA_PROGRAM: Optional[Path] = None
Expand Down Expand Up @@ -292,6 +294,18 @@ def check(self):
if self.USE_NDP_PROXY:
assert is_command_available("ndppd"), "Command `ndppd` not found, run `apt install ndppd`"

# Necessary for cloud-init customisation of instance
assert is_command_available(
"cloud-localds"
), "Command `cloud-localds` not found, run `apt install cloud-image-utils`"

if settings.ENABLE_QEMU_SUPPORT:
# Qemu support
assert is_command_available("qemu-img"), "Command `qemu-img` not found, run `apt install qemu-utils`"
assert is_command_available(
"qemu-system-x86_64"
), "Command `qemu-system-x86_64` not found, run `apt install qemu-system-x86`"

def setup(self):
os.makedirs(self.MESSAGE_CACHE, exist_ok=True)
os.makedirs(self.CODE_CACHE, exist_ok=True)
Expand Down
40 changes: 18 additions & 22 deletions src/aleph/vm/controllers/firecracker/executable.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import asyncio
import logging
import subprocess
from dataclasses import dataclass, field
from multiprocessing import Process, set_start_method
from os.path import exists, isfile
Expand All @@ -17,6 +16,7 @@

from aleph.vm.conf import settings
from aleph.vm.controllers.firecracker.snapshots import CompressedDiskVolumeSnapshot
from aleph.vm.controllers.interface import AlephVmControllerInterface
from aleph.vm.guest_api.__main__ import run_guest_api
from aleph.vm.hypervisors.firecracker.microvm import FirecrackerConfig, MicroVM
from aleph.vm.network.firewall import teardown_nftables_for_vm
Expand Down Expand Up @@ -137,7 +137,7 @@ class VmInitNotConnectedError(Exception):
ConfigurationType = TypeVar("ConfigurationType")


class AlephFirecrackerExecutable(Generic[ConfigurationType]):
class AlephFirecrackerExecutable(Generic[ConfigurationType], AlephVmControllerInterface):
vm_id: int
vm_hash: ItemHash
resources: AlephFirecrackerResources
Expand All @@ -150,6 +150,7 @@ class AlephFirecrackerExecutable(Generic[ConfigurationType]):
guest_api_process: Optional[Process] = None
is_instance: bool
_firecracker_config: Optional[FirecrackerConfig] = None
support_snapshot: bool

def __init__(
self,
Expand Down Expand Up @@ -186,26 +187,6 @@ def __init__(
self.guest_api_process = None
self._firecracker_config = None

def get_vm_ip(self) -> Optional[str]:
if self.tap_interface:
return self.tap_interface.guest_ip.with_prefixlen
return None

def get_vm_route(self) -> Optional[str]:
if self.tap_interface:
return str(self.tap_interface.host_ip).split("/", 1)[0]
return None

def get_vm_ipv6(self) -> Optional[str]:
if self.tap_interface:
return self.tap_interface.guest_ipv6.with_prefixlen
return None

def get_vm_ipv6_gateway(self) -> Optional[str]:
if self.tap_interface:
return str(self.tap_interface.host_ipv6.ip)
return None

def to_dict(self):
"""Dict representation of the virtual machine. Used to record resource usage and for JSON serialization."""
if self.fvm.proc and psutil:
Expand Down Expand Up @@ -301,3 +282,18 @@ async def teardown(self):

async def create_snapshot(self) -> CompressedDiskVolumeSnapshot:
raise NotImplementedError()

async def get_log_queue(self) -> asyncio.Queue:
    """Create a new log queue, register it on the VM and return it.

    The returned queue is bounded to 1000 entries. It is appended to
    ``self.fvm.log_queues`` so that callers can consume logs without
    touching the VM object directly.

    If more than 20 queues are already registered when this is called, the
    oldest registered queue is dropped from the list and a warning is logged.

    Returns:
        The newly registered ``asyncio.Queue``.
    """
    registered = self.fvm.log_queues

    # Keep the number of queues per VM bounded: evict the oldest entry
    # (front of the list) before adding a new one.
    if len(registered) > 20:
        logger.warning("Too many log queues, dropping the oldest one")
        del registered[0]

    new_queue: asyncio.Queue = asyncio.Queue(maxsize=1000)
    registered.append(new_queue)
    return new_queue

async def unregister_queue(self, queue: asyncio.Queue):
if queue in self.fvm.log_queues:
self.fvm.log_queues.remove(queue)
queue.empty()
11 changes: 6 additions & 5 deletions src/aleph/vm/controllers/firecracker/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class AlephFirecrackerInstance(AlephFirecrackerExecutable):
resources: AlephInstanceResources
latest_snapshot: Optional[DiskVolumeSnapshot]
is_instance = True
support_snapshot = False

def __init__(
self,
Expand Down Expand Up @@ -118,7 +119,7 @@ async def wait_for_init(self) -> None:
"""Wait for the init process of the instance to be ready."""
assert self.enable_networking and self.tap_interface, f"Network not enabled for VM {self.vm_id}"

ip = self.get_vm_ip()
ip = self.get_ip()
if not ip:
msg = "Host IP not available"
raise ValueError(msg)
Expand Down Expand Up @@ -188,10 +189,10 @@ def _create_network_file(self) -> bytes:

assert self.enable_networking and self.tap_interface, f"Network not enabled for VM {self.vm_id}"

ip = self.get_vm_ip()
route = self.get_vm_route()
ipv6 = self.get_vm_ipv6()
ipv6_gateway = self.get_vm_ipv6_gateway()
ip = self.get_ip()
route = self.get_ip_route()
ipv6 = self.get_ipv6()
ipv6_gateway = self.get_ipv6_gateway()

network = {
"ethernets": {
Expand Down
9 changes: 5 additions & 4 deletions src/aleph/vm/controllers/firecracker/program.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ class AlephFirecrackerProgram(AlephFirecrackerExecutable[ProgramVmConfiguration]
vm_configuration: ProgramVmConfiguration | None
resources: AlephProgramResources
is_instance = False
support_snapshot = False

def __init__(
self,
Expand Down Expand Up @@ -342,14 +343,14 @@ async def _setup_configuration(
machine to send this configuration. Other modes may use Cloud-init, ..."""
reader, writer = await asyncio.open_unix_connection(path=self.fvm.vsock_path)

ip = self.get_vm_ip()
ip = self.get_ip()
if ip:
# The ip and route should not contain the network mask in order to maintain
# compatibility with the existing runtimes.
ip = ip.split("/", 1)[0]
route = self.get_vm_route()
ipv6 = self.get_vm_ipv6()
ipv6_gateway = self.get_vm_ipv6_gateway()
route = self.get_ip_route()
ipv6 = self.get_ipv6()
ipv6_gateway = self.get_ipv6_gateway()

if not settings.DNS_NAMESERVERS:
msg = "Invalid configuration: DNS nameservers missing"
Expand Down
2 changes: 1 addition & 1 deletion src/aleph/vm/controllers/firecracker/snapshot_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def run_snapshots(self) -> None:
job_thread.start()

async def start_for(self, vm: AlephFirecrackerExecutable, frequency: Optional[int] = None) -> None:
if not vm.is_instance:
if not vm.support_snapshot:
msg = "Snapshots are not implemented for programs."
raise NotImplementedError(msg)

Expand Down
Loading

0 comments on commit ef8ae0b

Please sign in to comment.