From 6b557c449c2605806a161b31ad2c51ca52049cbe Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 20 Dec 2024 12:37:23 +0900 Subject: [PATCH 1/7] feat: Enable per-user UID/GID set for containers --- src/ai/backend/agent/agent.py | 60 +++++++-- src/ai/backend/agent/docker/agent.py | 159 +++++++++++++++++------ src/ai/backend/common/types.py | 4 + src/ai/backend/manager/models/session.py | 1 + src/ai/backend/manager/registry.py | 6 + 5 files changed, 181 insertions(+), 49 deletions(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 67ef3e4b38..3540265a6f 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -16,6 +16,13 @@ import zlib from abc import ABCMeta, abstractmethod from collections import defaultdict +from collections.abc import ( + Iterable, + Mapping, + MutableMapping, + MutableSequence, + Sequence, +) from decimal import Decimal from io import SEEK_END, BytesIO from pathlib import Path @@ -33,11 +40,7 @@ Generic, List, Literal, - Mapping, - MutableMapping, - MutableSequence, Optional, - Sequence, Set, Tuple, Type, @@ -192,6 +195,21 @@ KernelObjectType = TypeVar("KernelObjectType", bound=AbstractKernel) +def update_additional_gids(environ: Mapping[str, str], gids: Iterable[int]) -> None: + if orig_additional_gids := environ.get("ADDITIONAL_GIDS"): + orig_add_gids = {int(gid) for gid in orig_additional_gids.split(",") if gid} + additional_gids = orig_add_gids | set(gids) + environ = { + **environ, + "ADDITIONAL_GIDS": ",".join(map(str, additional_gids)), + } + else: + environ = { + **environ, + "ADDITIONAL_GIDS": ",".join(map(str, set(gids))), + } + + class AbstractKernelCreationContext(aobject, Generic[KernelObjectType]): kspec_version: int distro: str @@ -514,8 +532,8 @@ def mount_static_binary(filename: str, target_path: str) -> None: environ["LD_PRELOAD"] = "/opt/kernel/libbaihook.so" # Inject ComputeDevice-specific env-varibles and hooks - already_injected_hooks: Set[Path] = set() - additional_gid_set: Set[int] = set() + already_injected_hooks: set[Path] = set() + additional_gid_set: set[int] = set() for dev_type, device_alloc in resource_spec.allocations.items(): computer_ctx = self.computers[dev_type] @@ -552,7 +570,16 @@ def mount_static_binary(filename: str, target_path: str) -> None: environ["LD_PRELOAD"] += ":" + container_hook_path already_injected_hooks.add(hook_path) - environ["ADDITIONAL_GIDS"] = ",".join(map(str, additional_gid_set)) + update_additional_gids(environ, additional_gids) + + def get_overriding_uid(self) -> Optional[int]: + return None + + def get_overriding_gid(self) -> Optional[int]: + return None + + def get_supplementary_gids(self) -> set[int]: + return set() KernelCreationContextType = TypeVar( @@ -1845,11 +1872,22 @@ async def create_kernel( environ: dict[str, str] = {**kernel_config["environ"]} # Inject Backend.AI-intrinsic env-variables for gosu + if (ouid := ctx.get_overriding_uid()) is not None: + environ["LOCAL_USER_ID"] = str(ouid) + else: + if KernelFeatures.UID_MATCH in ctx.kernel_features: + uid = self.local_config["container"]["kernel-uid"] + environ["LOCAL_USER_ID"] = str(uid) + + gids: list[int] = [] + if (ogid := ctx.get_overriding_gid()) is not None: + gids.append(ogid) if KernelFeatures.UID_MATCH in ctx.kernel_features: - uid = self.local_config["container"]["kernel-uid"] - gid = self.local_config["container"]["kernel-gid"] - environ["LOCAL_USER_ID"] = str(uid) - environ["LOCAL_GROUP_ID"] = str(gid) + gids.append(self.local_config["container"]["kernel-gid"]) + if gids: + environ["LOCAL_GROUP_ID"] = str(gids.pop(0)) + + update_additional_gids(environ, [*gids, *ctx.get_supplementary_gids()]) environ.update( await ctx.get_extra_envs(), ) diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index 74c771813a..0e4b32589e 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -11,7 +11,7 @@ import signal import struct import sys -from collections.abc import Mapping +from collections.abc import Iterable, Mapping, MutableMapping, Sequence from decimal import Decimal from functools import partial from io import StringIO @@ -26,7 +26,6 @@ List, MutableMapping, Optional, - Sequence, Set, Tuple, Union, @@ -235,6 +234,8 @@ def __init__( self.tmp_dir = tmp_dir self.config_dir = scratch_dir / "config" self.work_dir = scratch_dir / "work" + self.uid = kernel_config["uid"] + self.gids = kernel_config["gids"] self.port_pool = port_pool self.agent_sockpath = agent_sockpath @@ -249,6 +250,24 @@ def __init__( self.network_plugin_ctx = network_plugin_ctx + @override + def get_overriding_uid(self) -> Optional[int]: + return self.uid + + @override + def get_overriding_gid(self) -> Optional[int]: + if self.gids: + return self.gids[0] + else: + return None + + @override + def get_supplementary_gids(self) -> set[int]: + if self.gids is not None: + return set(self.gids[1:]) + else: + return set() + def _kernel_resource_spec_read(self, filename): with open(filename, "r") as f: resource_spec = KernelResourceSpec.read_from_file(f) @@ -286,6 +305,18 @@ async def prepare_resource_spec(self) -> Tuple[KernelResourceSpec, Optional[Mapp resource_opts = self.kernel_config.get("resource_opts", {}) return resource_spec, resource_opts + def _chown(self, paths: Iterable[Path], uid: Optional[int], gid: Optional[int]) -> None: + if os.geteuid() == 0: # only possible when I am root. + for p in paths: + if KernelFeatures.UID_MATCH in self.kernel_features: + _uid = uid if uid is not None else self.local_config["container"]["kernel-uid"] + _gid = gid if gid is not None else self.local_config["container"]["kernel-gid"] + else: + stat = os.stat(p) + _uid = uid if uid is not None else stat.st_uid + _gid = gid if gid is not None else stat.st_gid + os.chown(p, _uid, _gid) + async def prepare_scratch(self) -> None: loop = current_loop() @@ -345,18 +376,35 @@ def _clone_dotfiles(): shutil.copy(zshrc_path.resolve(), self.work_dir / ".zshrc") shutil.copy(vimrc_path.resolve(), self.work_dir / ".vimrc") shutil.copy(tmux_conf_path.resolve(), self.work_dir / ".tmux.conf") - if KernelFeatures.UID_MATCH in self.kernel_features: - uid = self.local_config["container"]["kernel-uid"] - gid = self.local_config["container"]["kernel-gid"] - if os.geteuid() == 0: # only possible when I am root. - os.chown(self.work_dir, uid, gid) - os.chown(self.work_dir / ".jupyter", uid, gid) - os.chown(self.work_dir / ".jupyter" / "custom", uid, gid) - os.chown(self.work_dir / ".bashrc", uid, gid) - os.chown(self.work_dir / ".bash_profile", uid, gid) - os.chown(self.work_dir / ".zshrc", uid, gid) - os.chown(self.work_dir / ".vimrc", uid, gid) - os.chown(self.work_dir / ".tmux.conf", uid, gid) + + def chown_scratch(uid: Optional[int], gid: Optional[int]) -> None: + paths = [ + self.work_dir, + self.work_dir / ".jupyter", + self.work_dir / ".jupyter" / "custom", + self.work_dir / ".bashrc", + self.work_dir / ".bash_profile", + self.work_dir / ".zshrc", + self.work_dir / ".vimrc", + self.work_dir / ".tmux.conf", + ] + self._chown(paths, uid, gid) + + do_override = False + if (ouid := self.get_overriding_uid()) is not None: + do_override = True + + if (ogid := self.get_overriding_gid()) is not None: + do_override = True + + if do_override: + chown_scratch(ouid, ogid) + else: + if KernelFeatures.UID_MATCH in self.kernel_features: + chown_scratch( + self.local_config["container"]["kernel-uid"], + self.local_config["container"]["kernel-gid"], + ) await loop.run_in_executor(None, _clone_dotfiles) @@ -573,12 +621,25 @@ def _write_config(): priv_key_path.parent.mkdir(parents=True, exist_ok=True) priv_key_path.write_text(sshkey["private_key"]) pub_key_path.write_text(sshkey["public_key"]) - if KernelFeatures.UID_MATCH in self.kernel_features: - uid = self.local_config["container"]["kernel-uid"] - gid = self.local_config["container"]["kernel-gid"] - if os.geteuid() == 0: # only possible when I am root. - os.chown(str(priv_key_path), uid, gid) - os.chown(str(pub_key_path), uid, gid) + + def chown_keyfile(uid: Optional[int], gid: Optional[int]) -> None: + self._chown([priv_key_path, pub_key_path], uid, gid) + + do_override = False + if (ouid := self.get_overriding_uid()) is not None: + do_override = True + + if (ogid := self.get_overriding_gid()) is not None: + do_override = True + + if do_override: + chown_keyfile(ouid, ogid) + else: + if KernelFeatures.UID_MATCH in self.kernel_features: + chown_keyfile( + self.local_config["container"]["kernel-uid"], + self.local_config["container"]["kernel-gid"], + ) priv_key_path.chmod(0o600) if cluster_ssh_port_mapping := cluster_info["cluster_ssh_port_mapping"]: port_mapping_json_path = self.config_dir / "ssh" / "port-mapping.json" @@ -642,6 +703,8 @@ async def prepare_container( cluster_info: ClusterInfo, ) -> DockerKernel: loop = current_loop() + ouid = self.get_overriding_uid() + ogid = self.get_overriding_gid() if self.restarting: pass @@ -651,11 +714,15 @@ async def prepare_container( def _write_user_bootstrap_script(): (self.work_dir / "bootstrap.sh").write_text(bootstrap) - if KernelFeatures.UID_MATCH in self.kernel_features: - uid = self.local_config["container"]["kernel-uid"] - gid = self.local_config["container"]["kernel-gid"] - if os.geteuid() == 0: - os.chown(self.work_dir / "bootstrap.sh", uid, gid) + if ouid is not None or ogid is not None: + self._chown([self.work_dir / "bootstrap.sh"], ouid, ogid) + else: + if KernelFeatures.UID_MATCH in self.kernel_features: + self._chown( + [self.work_dir / "bootstrap.sh"], + self.local_config["container"]["kernel-uid"], + self.local_config["container"]["kernel-gid"], + ) await loop.run_in_executor(None, _write_user_bootstrap_script) @@ -717,14 +784,24 @@ def _populate_ssh_config(): (ssh_dir / "id_rsa").chmod(0o600) (self.work_dir / "id_container").write_bytes(privkey) (self.work_dir / "id_container").chmod(0o600) - if KernelFeatures.UID_MATCH in self.kernel_features: - uid = self.local_config["container"]["kernel-uid"] - gid = self.local_config["container"]["kernel-gid"] - if os.geteuid() == 0: # only possible when I am root. - os.chown(ssh_dir, uid, gid) - os.chown(ssh_dir / "authorized_keys", uid, gid) - os.chown(ssh_dir / "id_rsa", uid, gid) - os.chown(self.work_dir / "id_container", uid, gid) + + def chown_idfile(uid: Optional[int], gid: Optional[int]) -> None: + paths = [ + ssh_dir, + ssh_dir / "authorized_keys", + ssh_dir / "id_rsa", + self.work_dir / "id_container", + ] + self._chown(paths, uid, gid) + + if ouid is not None or ogid is not None: + chown_idfile(ouid, ogid) + else: + if KernelFeatures.UID_MATCH in self.kernel_features: + chown_idfile( + self.local_config["container"]["kernel-uid"], + self.local_config["container"]["kernel-gid"], + ) await loop.run_in_executor(None, _populate_ssh_config) @@ -742,14 +819,20 @@ def _populate_ssh_config(): await loop.run_in_executor(None, file_path.write_text, dotfile["data"]) tmp = Path(file_path) + tmp_paths: list[Path] = [] while tmp != self.work_dir: tmp.chmod(int(dotfile["perm"], 8)) - # only possible when I am root. - if KernelFeatures.UID_MATCH in self.kernel_features and os.geteuid() == 0: - uid = self.local_config["container"]["kernel-uid"] - gid = self.local_config["container"]["kernel-gid"] - os.chown(tmp, uid, gid) + tmp_paths.append(tmp) tmp = tmp.parent + if ouid is not None or ogid is not None: + self._chown(tmp_paths, ouid, ogid) + else: + if KernelFeatures.UID_MATCH in self.kernel_features: + self._chown( + tmp_paths, + self.local_config["container"]["kernel-uid"], + self.local_config["container"]["kernel-gid"], + ) kernel_obj = DockerKernel( self.kernel_id, diff --git a/src/ai/backend/common/types.py b/src/ai/backend/common/types.py index b01fe58cf0..c44a3d8adc 100644 --- a/src/ai/backend/common/types.py +++ b/src/ai/backend/common/types.py @@ -1090,6 +1090,8 @@ class KernelCreationConfig(TypedDict): cluster_role: str # the kernel's role in the cluster cluster_idx: int # the kernel's index in the cluster cluster_hostname: str # the kernel's hostname in the cluster + uid: Optional[int] + gids: Optional[list[int]] resource_slots: Mapping[str, str] # json form of ResourceSlot resource_opts: Mapping[str, str] # json form of resource options environ: Mapping[str, str] @@ -1120,6 +1122,8 @@ class KernelEnqueueingConfig(TypedDict): creation_config: dict bootstrap_script: str startup_command: Optional[str] + uid: Optional[int] + gids: Optional[list[int]] def _stringify_number(v: Union[BinarySize, int, float, Decimal]) -> str: diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 041e5e4b5e..f91804c889 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -1151,6 +1151,7 @@ async def get_session( selectinload(KernelRow.agent_row).noload("*"), ), ]) + _eager_loading_op.append(joinedload(SessionRow.user)) session_list = await cls.match_sessions( db_session, diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index a3489c4247..269a515b6c 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -619,6 +619,8 @@ async def create_session( "creation_config": config, "kernel_configs": [ { + "uid": sess.user.container_uid, + "gids": sess.user.container_gids, "image_ref": image_ref, "cluster_role": DEFAULT_ROLE, "cluster_idx": 1, @@ -1322,6 +1324,8 @@ async def enqueue_session( if not kernel["cluster_hostname"] else kernel["cluster_hostname"] ), + "uid": kernel["uid"], + "gids": kernel["gids"], "image": image_ref.canonical, # "image_id": image_row.id, "architecture": image_ref.architecture, @@ -1839,6 +1843,8 @@ def get_image_conf(kernel: KernelRow) -> ImageConfig: "package_directory": tuple(), "local_rank": binding.kernel.local_rank, "cluster_hostname": binding.kernel.cluster_hostname, + "uid": binding.kernel.uid, + "gids": binding.kernel.gids, "idle_timeout": int(idle_timeout), "mounts": [item.to_json() for item in scheduled_session.vfolder_mounts], "environ": { From 8d2cde60dc954ae5b1ff11cffcca13aae687a20d Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 20 Dec 2024 13:48:05 +0900 Subject: [PATCH 2/7] add news fragment --- changes/3279.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/3279.feature.md diff --git a/changes/3279.feature.md b/changes/3279.feature.md new file mode 100644 index 0000000000..49aed8c5e0 --- /dev/null +++ b/changes/3279.feature.md @@ -0,0 +1 @@ +Enable per-user UID/GID set for containers From 67067ea9b6d268d63ff5fcb1f8f8cecf3e1cbf97 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Mon, 23 Dec 2024 16:03:51 +0900 Subject: [PATCH 3/7] Eagerly get uid --- src/ai/backend/agent/agent.py | 6 ++++++ src/ai/backend/agent/docker/agent.py | 6 +++++- src/ai/backend/agent/dummy/agent.py | 4 ++++ src/ai/backend/agent/kubernetes/agent.py | 4 ++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 3540265a6f..f9d6d2e612 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -237,6 +237,7 @@ def __init__( distro: str, local_config: Mapping[str, Any], computers: MutableMapping[DeviceName, ComputerContext], + proc_uid: int, restarting: bool = False, ) -> None: self.image_labels = kernel_config["image"]["labels"] @@ -255,6 +256,7 @@ def __init__( self.computers = computers self.restarting = restarting self.local_config = local_config + self.proc_uid = proc_uid @abstractmethod async def get_extra_envs(self) -> Mapping[str, str]: @@ -613,6 +615,7 @@ class AbstractAgent( computers: MutableMapping[DeviceName, ComputerContext] images: Mapping[str, str] port_pool: Set[int] + proc_uid: int redis: Redis @@ -667,6 +670,7 @@ def __init__( local_config["container"]["port-range"][1] + 1, ) ) + self.proc_uid = os.geteuid() self.stats_monitor = stats_monitor self.error_monitor = error_monitor self._pending_creation_tasks = defaultdict(set) @@ -1741,6 +1745,7 @@ async def init_kernel_context( kernel_image: ImageRef, kernel_config: KernelCreationConfig, *, + proc_uid: int, restarting: bool = False, cluster_ssh_port_mapping: Optional[ClusterSSHPortMapping] = None, ) -> AbstractKernelCreationContext: @@ -1867,6 +1872,7 @@ async def create_kernel( kernel_image, kernel_config, restarting=restarting, + proc_uid=self.proc_uid, cluster_ssh_port_mapping=cluster_info.get("cluster_ssh_port_mapping"), ) environ: dict[str, str] = {**kernel_config["environ"]} diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index 0e4b32589e..24cb7c4c6d 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -211,6 +211,7 @@ def __init__( agent_sockpath: Path, resource_lock: asyncio.Lock, network_plugin_ctx: NetworkPluginContext, + proc_uid: int, restarting: bool = False, cluster_ssh_port_mapping: Optional[ClusterSSHPortMapping] = None, gwbridge_subnet: Optional[str] = None, @@ -225,6 +226,7 @@ def __init__( distro, local_config, computers, + proc_uid=proc_uid, restarting=restarting, ) scratch_dir = (self.local_config["container"]["scratch-root"] / str(kernel_id)).resolve() @@ -306,7 +308,7 @@ async def prepare_resource_spec(self) -> Tuple[KernelResourceSpec, Optional[Mapp return resource_spec, resource_opts def _chown(self, paths: Iterable[Path], uid: Optional[int], gid: Optional[int]) -> None: - if os.geteuid() == 0: # only possible when I am root. + if self.proc_uid == 0: # only possible when I am root. for p in paths: if KernelFeatures.UID_MATCH in self.kernel_features: _uid = uid if uid is not None else self.local_config["container"]["kernel-uid"] @@ -1671,6 +1673,7 @@ async def init_kernel_context( kernel_image: ImageRef, kernel_config: KernelCreationConfig, *, + proc_uid: int, restarting: bool = False, cluster_ssh_port_mapping: Optional[ClusterSSHPortMapping] = None, ) -> DockerKernelCreationContext: @@ -1689,6 +1692,7 @@ async def init_kernel_context( self.agent_sockpath, self.resource_lock, self.network_plugin_ctx, + proc_uid=proc_uid, restarting=restarting, cluster_ssh_port_mapping=cluster_ssh_port_mapping, gwbridge_subnet=self.gwbridge_subnet, diff --git a/src/ai/backend/agent/dummy/agent.py b/src/ai/backend/agent/dummy/agent.py index 9450a0d34c..22c71146dc 100644 --- a/src/ai/backend/agent/dummy/agent.py +++ b/src/ai/backend/agent/dummy/agent.py @@ -61,6 +61,7 @@ def __init__( distro: str, local_config: Mapping[str, Any], computers: MutableMapping[DeviceName, ComputerContext], + proc_uid: int, restarting: bool = False, *, dummy_config: Mapping[str, Any], @@ -75,6 +76,7 @@ def __init__( distro, local_config, computers, + proc_uid=proc_uid, restarting=restarting, ) self.dummy_config = dummy_config @@ -313,6 +315,7 @@ async def init_kernel_context( kernel_image: ImageRef, kernel_config: KernelCreationConfig, *, + proc_uid: int, restarting: bool = False, cluster_ssh_port_mapping: Optional[ClusterSSHPortMapping] = None, ) -> DummyKernelCreationContext: @@ -327,6 +330,7 @@ async def init_kernel_context( distro, self.local_config, self.computers, + proc_uid=proc_uid, restarting=restarting, dummy_config=self.dummy_config, ) diff --git a/src/ai/backend/agent/kubernetes/agent.py b/src/ai/backend/agent/kubernetes/agent.py index ac74f5ea8f..6fc2d3d70f 100644 --- a/src/ai/backend/agent/kubernetes/agent.py +++ b/src/ai/backend/agent/kubernetes/agent.py @@ -112,6 +112,7 @@ def __init__( computers: MutableMapping[DeviceName, ComputerContext], workers: Mapping[str, Mapping[str, str]], static_pvc_name: str, + proc_uid: int, restarting: bool = False, ) -> None: super().__init__( @@ -124,6 +125,7 @@ def __init__( distro, local_config, computers, + proc_uid=proc_uid, restarting=restarting, ) scratch_dir = (self.local_config["container"]["scratch-root"] / str(kernel_id)).resolve() @@ -1038,6 +1040,7 @@ async def init_kernel_context( kernel_image: ImageRef, kernel_config: KernelCreationConfig, *, + proc_uid: int, restarting: bool = False, cluster_ssh_port_mapping: Optional[ClusterSSHPortMapping] = None, ) -> KubernetesKernelCreationContext: @@ -1055,6 +1058,7 @@ async def init_kernel_context( self.computers, self.workers, "backend-ai-static-pvc", + proc_uid=proc_uid, restarting=restarting, ) From 34e1f32c25d29a25adc67e60caf968327584301b Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 2 Jan 2025 14:49:10 +0900 Subject: [PATCH 4/7] little update on update_additional_gids() func --- src/ai/backend/agent/agent.py | 14 +++++--------- src/ai/backend/agent/docker/agent.py | 1 - 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index f9d6d2e612..f46fcaf0f7 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -195,19 +195,13 @@ KernelObjectType = TypeVar("KernelObjectType", bound=AbstractKernel) -def update_additional_gids(environ: Mapping[str, str], gids: Iterable[int]) -> None: +def update_additional_gids(environ: MutableMapping[str, str], gids: Iterable[int]) -> None: if orig_additional_gids := environ.get("ADDITIONAL_GIDS"): orig_add_gids = {int(gid) for gid in orig_additional_gids.split(",") if gid} additional_gids = orig_add_gids | set(gids) - environ = { - **environ, - "ADDITIONAL_GIDS": ",".join(map(str, additional_gids)), - } else: - environ = { - **environ, - "ADDITIONAL_GIDS": ",".join(map(str, set(gids))), - } + additional_gids = set(gids) + environ["ADDITIONAL_GIDS"] = ",".join(map(str, additional_gids)) class AbstractKernelCreationContext(aobject, Generic[KernelObjectType]): @@ -1885,6 +1879,8 @@ async def create_kernel( uid = self.local_config["container"]["kernel-uid"] environ["LOCAL_USER_ID"] = str(uid) + # The first element of the `gids` list becomes the kernel's main gid. + # Elements after the first one become supplementary gids. gids: list[int] = [] if (ogid := ctx.get_overriding_gid()) is not None: gids.append(ogid) diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index 24cb7c4c6d..ab062f3d56 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -24,7 +24,6 @@ Final, FrozenSet, List, - MutableMapping, Optional, Set, Tuple, From a36514a5fc0416cb2d987b907a5c3993027b4a41 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 2 Jan 2025 15:36:32 +0900 Subject: [PATCH 5/7] follow up replacing 'gids' to ('main_gid' and 'supplementary_gids') --- src/ai/backend/agent/agent.py | 21 +++++++++---------- src/ai/backend/agent/docker/agent.py | 15 +++++-------- src/ai/backend/common/types.py | 6 ++++-- .../f6ca2f2d04c1_add_uid_and_gid_columns.py | 11 ++++++---- src/ai/backend/manager/models/kernel.py | 4 ++-- src/ai/backend/manager/models/user.py | 5 ++++- src/ai/backend/manager/registry.py | 9 +++++--- 7 files changed, 38 insertions(+), 33 deletions(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index f46fcaf0f7..8e3a5dbd7d 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -196,6 +196,8 @@ def update_additional_gids(environ: MutableMapping[str, str], gids: Iterable[int]) -> None: + if not gids: + return if orig_additional_gids := environ.get("ADDITIONAL_GIDS"): orig_add_gids = {int(gid) for gid in orig_additional_gids.split(",") if gid} additional_gids = orig_add_gids | set(gids) @@ -574,8 +576,8 @@ def get_overriding_uid(self) -> Optional[int]: def get_overriding_gid(self) -> Optional[int]: return None - def get_supplementary_gids(self) -> set[int]: - return set() + def get_supplementary_gids(self) -> Optional[list[int]]: + return None KernelCreationContextType = TypeVar( @@ -1879,17 +1881,14 @@ async def create_kernel( uid = self.local_config["container"]["kernel-uid"] environ["LOCAL_USER_ID"] = str(uid) - # The first element of the `gids` list becomes the kernel's main gid. - # Elements after the first one become supplementary gids. - gids: list[int] = [] if (ogid := ctx.get_overriding_gid()) is not None: - gids.append(ogid) - if KernelFeatures.UID_MATCH in ctx.kernel_features: - gids.append(self.local_config["container"]["kernel-gid"]) - if gids: - environ["LOCAL_GROUP_ID"] = str(gids.pop(0)) + environ["LOCAL_GROUP_ID"] = str(ogid) + else: + if KernelFeatures.UID_MATCH in ctx.kernel_features: + gid = self.local_config["container"]["kernel-gid"] + environ["LOCAL_GROUP_ID"] = str(gid) - update_additional_gids(environ, [*gids, *ctx.get_supplementary_gids()]) + update_additional_gids(environ, ctx.get_supplementary_gids() or []) environ.update( await ctx.get_extra_envs(), ) diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index ab062f3d56..e8bda69dfc 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -236,7 +236,8 @@ def __init__( self.config_dir = scratch_dir / "config" self.work_dir = scratch_dir / "work" self.uid = kernel_config["uid"] - self.gids = kernel_config["gids"] + self.main_gid = kernel_config["main_gid"] + self.supplementary_gids = kernel_config["supplementary_gids"] self.port_pool = port_pool self.agent_sockpath = agent_sockpath @@ -257,17 +258,11 @@ def get_overriding_uid(self) -> Optional[int]: @override def get_overriding_gid(self) -> Optional[int]: - if self.gids: - return self.gids[0] - else: - return None + return self.main_gid @override - def get_supplementary_gids(self) -> set[int]: - if self.gids is not None: - return set(self.gids[1:]) - else: - return set() + def get_supplementary_gids(self) -> Optional[list[int]]: + return self.supplementary_gids def _kernel_resource_spec_read(self, filename): with open(filename, "r") as f: diff --git a/src/ai/backend/common/types.py b/src/ai/backend/common/types.py index c44a3d8adc..8eb7f9c017 100644 --- a/src/ai/backend/common/types.py +++ b/src/ai/backend/common/types.py @@ -1091,7 +1091,8 @@ class KernelCreationConfig(TypedDict): cluster_idx: int # the kernel's index in the cluster cluster_hostname: str # the kernel's hostname in the cluster uid: Optional[int] - gids: Optional[list[int]] + main_gid: Optional[int] + supplementary_gids: Optional[list[int]] resource_slots: Mapping[str, str] # json form of ResourceSlot resource_opts: Mapping[str, str] # json form of resource options environ: Mapping[str, str] @@ -1123,7 +1124,8 @@ class KernelEnqueueingConfig(TypedDict): bootstrap_script: str startup_command: Optional[str] uid: Optional[int] - gids: Optional[list[int]] + main_gid: Optional[int] + supplementary_gids: Optional[list[int]] def _stringify_number(v: Union[BinarySize, int, float, Decimal]) -> str: diff --git a/src/ai/backend/manager/models/alembic/versions/f6ca2f2d04c1_add_uid_and_gid_columns.py b/src/ai/backend/manager/models/alembic/versions/f6ca2f2d04c1_add_uid_and_gid_columns.py index 338c801180..1aefcb9237 100644 --- a/src/ai/backend/manager/models/alembic/versions/f6ca2f2d04c1_add_uid_and_gid_columns.py +++ b/src/ai/backend/manager/models/alembic/versions/f6ca2f2d04c1_add_uid_and_gid_columns.py @@ -27,7 +27,10 @@ def upgrade() -> None: op.add_column( "kernels", sa.Column( - "additional_gids", sa.ARRAY(sa.Integer()), server_default=sa.text("NULL"), nullable=True + "supplementary_gids", + sa.ARRAY(sa.Integer()), + server_default=sa.text("NULL"), + nullable=True, ), ) op.add_column( @@ -43,7 +46,7 @@ def upgrade() -> None: op.add_column( "users", sa.Column( - "container_additional_gids", + "container_supplementary_gids", sa.ARRAY(sa.Integer()), server_default=sa.text("NULL"), nullable=True, @@ -53,8 +56,8 @@ def upgrade() -> None: def downgrade() -> None: op.drop_column("users", "container_main_gid") - op.drop_column("users", "container_additional_gids") + op.drop_column("users", "container_supplementary_gids") op.drop_column("users", "container_uid") - op.drop_column("kernels", "additional_gids") + op.drop_column("kernels", "supplementary_gids") op.drop_column("kernels", "main_gid") op.drop_column("kernels", "uid") diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index 2872ff8a42..f35156d8a4 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -418,8 +418,8 @@ class KernelRow(Base): ) uid = sa.Column("uid", sa.Integer, nullable=True, server_default=sa.null()) main_gid = sa.Column("main_gid", sa.Integer, nullable=True, server_default=sa.null()) - additional_gids = sa.Column( - "additional_gids", sa.ARRAY(sa.Integer), nullable=True, server_default=sa.null() + supplementary_gids = sa.Column( + "supplementary_gids", sa.ARRAY(sa.Integer), nullable=True, server_default=sa.null() ) # Resource ownership diff --git a/src/ai/backend/manager/models/user.py b/src/ai/backend/manager/models/user.py index a7cd3cb67d..0673b229c8 100644 --- a/src/ai/backend/manager/models/user.py +++ b/src/ai/backend/manager/models/user.py @@ -162,7 +162,10 @@ class UserStatus(enum.StrEnum): sa.Column("container_uid", sa.Integer, nullable=True, server_default=sa.null()), sa.Column("container_main_gid", sa.Integer, nullable=True, server_default=sa.null()), sa.Column( - "container_additional_gids", sa.ARRAY(sa.Integer), nullable=True, server_default=sa.null() + "container_supplementary_gids", + sa.ARRAY(sa.Integer), + nullable=True, + server_default=sa.null(), ), ) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 269a515b6c..c6fb26a37d 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -620,7 +620,8 @@ async def create_session( "kernel_configs": [ { "uid": sess.user.container_uid, - "gids": sess.user.container_gids, + "main_gid": sess.user.container_main_gid, + "supplementary_gids": sess.user.container_supplementary_gids, "image_ref": image_ref, "cluster_role": DEFAULT_ROLE, "cluster_idx": 1, @@ -1325,7 +1326,8 @@ async def enqueue_session( else kernel["cluster_hostname"] ), "uid": kernel["uid"], - "gids": kernel["gids"], + "main_gid": kernel["main_gid"], + "supplementary_gids": kernel["supplementary_gids"], "image": image_ref.canonical, # "image_id": image_row.id, "architecture": image_ref.architecture, @@ -1844,7 +1846,8 @@ def get_image_conf(kernel: KernelRow) -> ImageConfig: "local_rank": binding.kernel.local_rank, "cluster_hostname": binding.kernel.cluster_hostname, "uid": binding.kernel.uid, - "gids": binding.kernel.gids, + "main_gid": binding.kernel.main_gid, + "supplementary_gids": binding.kernel.supplementary_gids, "idle_timeout": int(idle_timeout), "mounts": [item.to_json() for item in scheduled_session.vfolder_mounts], "environ": { From b630681dbe40ef2fd22de1e3f876ce0ecbd998a6 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 2 Jan 2025 17:56:40 +0900 Subject: [PATCH 6/7] use list rather than set --- src/ai/backend/agent/agent.py | 13 ++++++++----- src/ai/backend/agent/docker/agent.py | 4 ++-- src/ai/backend/common/types.py | 4 ++-- src/ai/backend/manager/registry.py | 15 +++++++++++---- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 8e3a5dbd7d..1087bcb1c2 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -576,8 +576,8 @@ def get_overriding_uid(self) -> Optional[int]: def get_overriding_gid(self) -> Optional[int]: return None - def get_supplementary_gids(self) -> Optional[list[int]]: - return None + def get_supplementary_gids(self) -> set[int]: + return set() KernelCreationContextType = TypeVar( @@ -1881,14 +1881,17 @@ async def create_kernel( uid = self.local_config["container"]["kernel-uid"] environ["LOCAL_USER_ID"] = str(uid) + sgids = set(ctx.get_supplementary_gids() or []) + kernel_gid: int = self.local_config["container"]["kernel-gid"] if (ogid := ctx.get_overriding_gid()) is not None: environ["LOCAL_GROUP_ID"] = str(ogid) + if KernelFeatures.UID_MATCH in ctx.kernel_features: + sgids.add(kernel_gid) else: if KernelFeatures.UID_MATCH in ctx.kernel_features: - gid = self.local_config["container"]["kernel-gid"] - environ["LOCAL_GROUP_ID"] = str(gid) + environ["LOCAL_GROUP_ID"] = str(kernel_gid) - update_additional_gids(environ, ctx.get_supplementary_gids() or []) + update_additional_gids(environ, sgids) environ.update( await ctx.get_extra_envs(), ) diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index e8bda69dfc..0b5af3e243 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -237,7 +237,7 @@ def __init__( self.work_dir = scratch_dir / "work" self.uid = kernel_config["uid"] self.main_gid = kernel_config["main_gid"] - self.supplementary_gids = kernel_config["supplementary_gids"] + self.supplementary_gids = set(kernel_config["supplementary_gids"]) self.port_pool = port_pool self.agent_sockpath = agent_sockpath @@ -261,7 +261,7 @@ def get_overriding_gid(self) -> Optional[int]: return self.main_gid @override - def get_supplementary_gids(self) -> Optional[list[int]]: + def get_supplementary_gids(self) -> set[int]: return self.supplementary_gids def _kernel_resource_spec_read(self, filename): diff --git a/src/ai/backend/common/types.py b/src/ai/backend/common/types.py index 8eb7f9c017..0213365ca5 100644 --- a/src/ai/backend/common/types.py +++ b/src/ai/backend/common/types.py @@ -1092,7 +1092,7 @@ class KernelCreationConfig(TypedDict): cluster_hostname: str # the kernel's hostname in the cluster uid: Optional[int] main_gid: Optional[int] - supplementary_gids: Optional[list[int]] + supplementary_gids: list[int] resource_slots: Mapping[str, str] # json form of ResourceSlot resource_opts: Mapping[str, str] # json form of resource options environ: Mapping[str, str] @@ -1125,7 +1125,7 @@ class KernelEnqueueingConfig(TypedDict): startup_command: Optional[str] uid: Optional[int] main_gid: Optional[int] - supplementary_gids: Optional[list[int]] + supplementary_gids: list[int] def _stringify_number(v: Union[BinarySize, int, float, Decimal]) -> str: diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index c6fb26a37d..3a590413f9 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -605,6 +605,11 @@ async def create_session( script, _ = await query_bootstrap_script(conn, owner_access_key) bootstrap_script = script + user_row = await db_sess.scalar( + sa.select(UserRow).where(UserRow.uuid == user_scope.user_uuid) + ) + user_row = cast(UserRow, user_row) + public_sgroup_only = session_type not in PRIVATE_SESSION_TYPES if dry_run: return {} @@ -619,9 +624,11 @@ async def create_session( "creation_config": config, "kernel_configs": [ { - "uid": sess.user.container_uid, - "main_gid": sess.user.container_main_gid, - "supplementary_gids": sess.user.container_supplementary_gids, + "uid": user_row.container_uid, + "main_gid": user_row.container_main_gid, + "supplementary_gids": ( + user_row.container_supplementary_gids or [] + ), "image_ref": image_ref, "cluster_role": DEFAULT_ROLE, "cluster_idx": 1, @@ -1847,7 +1854,7 @@ def get_image_conf(kernel: KernelRow) -> ImageConfig: "cluster_hostname": binding.kernel.cluster_hostname, "uid": binding.kernel.uid, "main_gid": binding.kernel.main_gid, - "supplementary_gids": binding.kernel.supplementary_gids, + "supplementary_gids": binding.kernel.supplementary_gids or [], "idle_timeout": int(idle_timeout), "mounts": [item.to_json() for item in scheduled_session.vfolder_mounts], "environ": { From 72160a4094f210406f43fcb438cdfcf3a1414e0b Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 2 Jan 2025 19:25:23 +0900 Subject: [PATCH 7/7] remove news fragment --- changes/3279.feature.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 changes/3279.feature.md diff --git a/changes/3279.feature.md b/changes/3279.feature.md deleted file mode 100644 index 49aed8c5e0..0000000000 --- a/changes/3279.feature.md +++ /dev/null @@ -1 +0,0 @@ -Enable per-user UID/GID set for containers