diff --git a/src/ai/backend/install/common.py b/src/ai/backend/install/common.py index 1e975ab2a2..7ce0e12c34 100644 --- a/src/ai/backend/install/common.py +++ b/src/ai/backend/install/common.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from rich.text import Text +from ai.backend.common.arch import arch_name_aliases from .types import OSInfo, Platform @@ -16,7 +16,7 @@ async def detect_os(ctx: Context) -> OSInfo: platform_kernel = sys.platform - platform_arch = platform.machine() + platform_arch = arch_name_aliases.get(platform.machine(), platform.machine()) distro: str | None = None uname_s_output = b"" try: @@ -26,7 +26,7 @@ async def detect_os(ctx: Context) -> OSInfo: stderr=asyncio.subprocess.DEVNULL, ) assert p.stdout is not None - uname_s_output = await p.stdout.read() + uname_s_output = (await p.stdout.read()).strip() await p.wait() except OSError: pass @@ -38,17 +38,18 @@ async def detect_os(ctx: Context) -> OSInfo: stderr=asyncio.subprocess.DEVNULL, ) assert p.stdout is not None - lsb_release_output = await p.stdout.read() + lsb_release_output = (await p.stdout.read()).strip() await p.wait() except OSError: pass try: - issue_output = Path("/etc/issue").read_bytes() + issue_output = Path("/etc/issue").read_bytes().strip() except IOError: issue_output = b"" release_metadata = lsb_release_output + b"\n" + issue_output - if uname_s_output == "Darwin": + if uname_s_output == b"Darwin": assert platform_kernel == "darwin" + platform_kernel = "macos" distro = "Darwin" elif ( Path("/etc/debian_version").exists() @@ -70,14 +71,14 @@ async def detect_os(ctx: Context) -> OSInfo: assert platform_kernel == "linux" distro = "SUSE" else: - raise RuntimeError("Unsupported host linux distribution") - os_info = OSInfo( + raise RuntimeError( + "Unsupported host linux distribution: " + f"{uname_s_output.decode()!r}, {release_metadata.decode()!r}" + ) + return OSInfo( platform=Platform(f"{platform_kernel}-{platform_arch}").value, # type: ignore 
distro=distro, ) - ctx.log.write(Text.from_markup("Detected OS info: ", end="")) - ctx.log.write(os_info) - return os_info async def detect_cuda(ctx: Context) -> None: diff --git a/src/ai/backend/install/context.py b/src/ai/backend/install/context.py index c03efce6d1..bfc8ac7a5a 100644 --- a/src/ai/backend/install/context.py +++ b/src/ai/backend/install/context.py @@ -37,7 +37,12 @@ install_git_lfs, pants_export, ) -from .docker import check_docker, check_docker_desktop_mount, get_preferred_pants_local_exec_root +from .docker import ( + check_docker, + check_docker_desktop_mount, + determine_docker_sudo, + get_preferred_pants_local_exec_root, +) from .http import wget from .python import check_python from .types import ( @@ -63,6 +68,7 @@ class PostGuide(enum.Enum): class Context(metaclass=ABCMeta): os_info: OSInfo + docker_sudo: list[str] _post_guides: list[PostGuide] @@ -90,7 +96,7 @@ def log_header(self, title: str) -> None: def mangle_pkgname(self, name: str, fat: bool = False) -> str: # local-proxy does not have fat variant. (It is always fat.) 
async def populate_images(self) -> None:
    """
    Walk through ``self.dist_info.image_sources`` and prepare images for each.

    Per source kind, in list order:

    * ``BACKENDAI_REGISTRY``: store the ``cr.backend.ai`` registry config via
      ``etcd_put_json("config", ...)``, run ``mgr image rescan cr.backend.ai``
      and alias ``python`` to the stable Python image for the host arch.
    * ``DOCKER_HUB``: store the ``index.docker.io`` registry config, then
      rescan and ``docker pull`` every entry of ``self.dist_info.image_refs``.
    * ``LOCAL_DIR``: ``docker load`` every file listed in
      ``self.dist_info.image_payloads``.
    * ``LOCAL_REGISTRY``: not supported yet.

    Docker commands are prefixed with ``self.docker_sudo``.

    Raises:
        NotImplementedError: When a ``LOCAL_REGISTRY`` source is encountered.
    """
    etcd_config: Any
    for source in self.dist_info.image_sources:
        if source == ImageSource.BACKENDAI_REGISTRY:
            self.log_header("Scanning and pulling configured Backend.AI container images...")
            # The ARM64 check drives both the harbor project list and the alias arch.
            on_arm64 = self.os_info.platform in (
                Platform.LINUX_ARM64,
                Platform.MACOS_ARM64,
            )
            harbor_projects = "stable,community,multiarch" if on_arm64 else "stable,community"
            backendai_registry = {
                "": "https://cr.backend.ai",
                "type": "harbor2",
                "project": harbor_projects,
            }
            etcd_config = {
                "docker": {
                    "image": {
                        "auto_pull": "tag",  # FIXME: temporary workaround for multiarch
                    },
                    "registry": {
                        "cr.backend.ai": backendai_registry,
                    },
                },
            }
            await self.etcd_put_json("config", etcd_config)
            await self.run_manager_cli(["mgr", "image", "rescan", "cr.backend.ai"])
            await self.alias_image(
                "python",
                "cr.backend.ai/stable/python:3.9-ubuntu20.04",
                "aarch64" if on_arm64 else "x86_64",
            )
        elif source == ImageSource.DOCKER_HUB:
            self.log_header("Scanning and pulling configured Docker Hub container images...")
            docker_hub_registry = {
                "": "https://registry-1.docker.io",
                "type": "docker",
                "username": "lablup",
            }
            etcd_config = {
                "docker": {
                    "image": {
                        "auto_pull": "tag",  # FIXME: temporary workaround for multiarch
                    },
                    "registry": {
                        "index.docker.io": docker_hub_registry,
                    },
                },
            }
            await self.etcd_put_json("config", etcd_config)
            for image_ref in self.dist_info.image_refs:
                await self.run_manager_cli(["mgr", "image", "rescan", image_ref])
                pull_cmd = [*self.docker_sudo, "docker", "pull", image_ref]
                await self.run_exec(pull_cmd)
        elif source == ImageSource.LOCAL_DIR:
            self.log_header("Populating local container images...")
            for payload in self.dist_info.image_payloads:
                # TODO: Ensure payload.ref
                load_cmd = [*self.docker_sudo, "docker", "load", "-i", str(payload.file)]
                await self.run_exec(load_cmd)
        elif source == ImageSource.LOCAL_REGISTRY:
            raise NotImplementedError()
-793,6 +815,12 @@ def hydrate_install_info(self) -> InstallInfo: async def check_prerequisites(self) -> None: self.os_info = await detect_os(self) + self.log.write(Text.from_markup("Detected OS info: ", end="")) + self.log.write(self.os_info) + if determine_docker_sudo(): + self.docker_sudo = ["sudo"] + else: + self.docker_sudo = [] await check_docker(self) if self.os_info.distro == "Darwin": await check_docker_desktop_mount(self) diff --git a/src/ai/backend/install/docker.py b/src/ai/backend/install/docker.py index 24c8a98840..2ded312ef0 100644 --- a/src/ai/backend/install/docker.py +++ b/src/ai/backend/install/docker.py @@ -3,11 +3,14 @@ import asyncio import base64 import hashlib +import json import os import re from pathlib import Path from typing import TYPE_CHECKING +from rich.text import Text + from ai.backend.install.types import PrerequisiteError from .http import request_unix @@ -58,29 +61,77 @@ async def detect_snap_docker(): return pkg_data["version"] -async def detect_system_docker(): +async def detect_system_docker(ctx: Context): + # Well-known docker socket paths sock_paths = [ Path("/run/docker.sock"), # Linux default Path("/var/run/docker.sock"), # macOS default ] + if ctx.docker_sudo: + ctx.log.write( + Text.from_markup("[yellow]Docker commands require sudo. 
We will use sudo.[/]") + ) + + # Read from context + proc = await asyncio.create_subprocess_exec( + *(*ctx.docker_sudo, "docker", "context", "show"), + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + assert proc.stdout is not None + stdout = "" + try: + async with asyncio.timeout(0.5): + stdout = (await proc.stdout.read()).decode().strip() + await proc.wait() + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + raise PrerequisiteError( + "sudo requires prompt.", + instruction="Please make sudo available without password prompts.", + ) + context_name = stdout + proc = await asyncio.create_subprocess_exec( + *(*ctx.docker_sudo, "docker", "context", "inspect", context_name), + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + ) + assert proc.stdout is not None + stdout = (await proc.stdout.read()).decode() + await proc.wait() + context_info = json.loads(stdout) + context_sock_path = context_info[0]["Endpoints"]["docker"]["Host"].removeprefix("unix://") + sock_paths.insert(0, Path(context_sock_path)) + + # Read from environment variable if env_sock_path := os.environ.get("DOCKER_HOST", None): # Some special setups like OrbStack may have a custom DOCKER_HOST. 
async def determine_docker_sudo() -> bool:
    """
    Probe whether the ``docker`` CLI has to be invoked through ``sudo``.

    Runs ``docker version`` as the current user and inspects the result.

    This is a coroutine function, so callers must ``await`` it. A bare call
    only creates a coroutine object, which is always truthy, so testing the
    un-awaited result would always select ``sudo``.

    Returns:
        ``False`` when ``docker version`` exits with status 0.
        ``True`` when it fails and its combined stdout/stderr contains
        "permission denied" (compared case-insensitively).

    Raises:
        RuntimeError: When the ``docker`` executable cannot be started at all,
            or when it fails for a reason other than a permission error.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            "docker",
            "version",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )
    except OSError as e:
        # A missing (or non-executable) binary fails at spawn time with an
        # OSError such as FileNotFoundError; it never reaches the exit-code
        # check below, so translate it into the intended error here.
        raise RuntimeError("Docker client command is not available in the host.") from e
    # communicate() drains the pipe and reaps the child in a single step.
    raw_output, _ = await proc.communicate()
    if proc.returncode == 0:
        # installed, does not require sudo
        return False
    # Decode leniently: the text is only scanned for a marker phrase, so
    # undecodable bytes must not mask the real outcome.
    if "permission denied" in raw_output.decode(errors="replace").lower():
        # installed, requires sudo
        return True
    raise RuntimeError("Docker client command is not available in the host.")