From 5a78843c4a9d9d88e6aa44814edeae674e1329ae Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Tue, 21 Nov 2023 16:47:41 +0000 Subject: [PATCH] doc: Revamp the key concepts part (#1468) Backported-from: main Backported-to: 23.09 --- changes/1468.doc.md | 1 + docs/_static/css/customTheme.css | 66 +++++----- docs/_static/custom.css | 5 + docs/concepts/api-overview.rst | 29 ---- docs/concepts/computing.rst | 175 +++++++++++++++++++++++++ docs/concepts/configuration.rst | 27 ++++ docs/concepts/faq.rst | 22 ++-- docs/concepts/index.rst | 28 +++- docs/concepts/key-concepts.rst | 88 ------------- docs/concepts/monitoring.rst | 16 +++ docs/concepts/networking.rst | 60 +++++++++ docs/concepts/resources.rst | 81 ++++++++++++ docs/concepts/server-architecture.svg | 2 +- docs/concepts/services.rst | 117 +++++++++++++++++ docs/concepts/storage.rst | 111 ++++++++++++++++ docs/concepts/users.rst | 19 +++ docs/concepts/vfolder-dir-quota.svg | 1 + docs/concepts/vfolder-volume-quota.svg | 1 + docs/dev/daily-workflows.rst | 30 ++++- docs/requirements.txt | 13 +- 20 files changed, 717 insertions(+), 175 deletions(-) create mode 100644 changes/1468.doc.md delete mode 100644 docs/concepts/api-overview.rst create mode 100644 docs/concepts/computing.rst create mode 100644 docs/concepts/configuration.rst delete mode 100644 docs/concepts/key-concepts.rst create mode 100644 docs/concepts/monitoring.rst create mode 100644 docs/concepts/networking.rst create mode 100644 docs/concepts/resources.rst create mode 100644 docs/concepts/services.rst create mode 100644 docs/concepts/storage.rst create mode 100644 docs/concepts/users.rst create mode 100644 docs/concepts/vfolder-dir-quota.svg create mode 100644 docs/concepts/vfolder-volume-quota.svg diff --git a/changes/1468.doc.md b/changes/1468.doc.md new file mode 100644 index 0000000000..0f77093fdd --- /dev/null +++ b/changes/1468.doc.md @@ -0,0 +1 @@ +Refine and elaborate the Concepts section to reflect all the new features and concepts added in 
last 3 years diff --git a/docs/_static/css/customTheme.css b/docs/_static/css/customTheme.css index ccfa125772..e81d5ee611 100644 --- a/docs/_static/css/customTheme.css +++ b/docs/_static/css/customTheme.css @@ -1,9 +1,9 @@ /* --Reset CSS Start-- */ -html, body, div, span, -h1, h2, h3, h4, h5, h6, -p, a, img, strong, -dl, dt, dd, ol, ul, li, -table, article, aside, canvas, +html, body, div, span, +h1, h2, h3, h4, h5, h6, +p, a, img, strong, +dl, dt, dd, ol, ul, li, +table, article, aside, canvas, footer, header, menu, nav, section, mark, code { margin: 0; padding: 0; @@ -70,7 +70,7 @@ ol, ul { -webkit-box-sizing: border-box !important; -moz-box-sizing: border-box !important; -ms-box-sizing: border-box !important; - box-sizing: border-box !important; + box-sizing: border-box !important; } html { @@ -466,7 +466,7 @@ input::placeholder { font-size: 1.4rem; padding: .8rem 1.2rem; } - + input::placeholder { font-size: 1.4rem; } @@ -3499,7 +3499,7 @@ input::placeholder { height: 9rem; border-bottom: 1px solid #000; } - + #backend-ai-header a div { width: 18rem; } @@ -5065,7 +5065,7 @@ hr { .breadcrumb-item a { font-size: 2.308rem; } - + .breadcrumb-item:before, .breadcrumb-item:after { font-size: 2.308rem; padding: 0 .923rem 0 0; @@ -5085,7 +5085,7 @@ hr { .breadcrumb-item a { font-size: 1.8rem; } - + .breadcrumb-item:before, .breadcrumb-item:after { font-size: 1.8rem; padding: 0 1rem; @@ -5674,7 +5674,7 @@ hr { .wy-nav-content { height: 100%; max-width: 1199px; - margin: auto + margin: auto; } .wy-body-mask { @@ -5950,16 +5950,16 @@ img.search-icon { .customized-search-bar { margin-top: 3.072rem; } - + .customized-search-bar input:first-child { padding-right: 5rem; height: 6.142rem; } - + .customized-search-bar input:first-child::placeholder { font-size: 2.3rem; } - + #search-icon-wrapper { position: absolute; width: 3.69rem; @@ -5979,21 +5979,25 @@ img.search-icon { padding-top: 8rem; } + html { + scroll-padding-top: 10rem; + } + .customized-search-bar { display: none; 
margin-top: 1.5rem; } - + .customized-search-bar input:first-child { width: 29rem; padding-right: 3.5rem; height: 4rem; } - + .customized-search-bar input:first-child::placeholder { font-size: 1.4rem; } - + #search-icon-wrapper { position: absolute; width: 2.4rem; @@ -6401,14 +6405,14 @@ div.rst-other-versions small a { .rst-content .admonition-title:before { font-size: 2.918rem; margin-right: 1.536rem; - } + } } @media (min-width: 1200px) { .rst-content .admonition-title:before { font-size: 1.9rem; margin-right: 1rem; - } + } } .rst-content .admonition.warning p { @@ -6432,7 +6436,7 @@ div.rst-other-versions small a { .rst-content .admonition.warning .admonition-title:before { font-size: 1.9rem; margin-right: 1rem; - } + } } .rst-content .admonition.danger .admonition-title:before { @@ -6445,14 +6449,14 @@ div.rst-other-versions small a { .rst-content .admonition.danger .admonition-title:before { font-size: 2.918rem; margin-right: 1.536rem; - } + } } @media (min-width: 1200px) { .rst-content .admonition.danger .admonition-title:before { font-size: 1.9rem; margin-right: 1rem; - } + } } .rst-content .admonition.tip .admonition-title:before { @@ -6465,14 +6469,14 @@ div.rst-other-versions small a { .rst-content .admonition.tip .admonition-title:before { font-size: 2.918rem; margin-right: 1.536rem; - } + } } @media (min-width: 1200px) { .rst-content .admonition.tip .admonition-title:before { font-size: 1.9rem; margin-right: 1rem; - } + } } .rst-content .admonition.hint .admonition-title:before { @@ -7115,7 +7119,7 @@ tbody tr td .admonition.warning p, tbody tr td .admonition.note p { div.versionchanged, div.versionchanged p { font-size: 1.536rem; } - + tbody tr td div.versionchanged .versionmodified, tbody tr td div.versionadded .versionmodified { font-size: 1.536rem; } @@ -7269,7 +7273,7 @@ div.rst-footer-buttons a:last-child { div.rst-footer-buttons a:last-child { padding-right: 0; border-left: none; - } + } } @media (min-width: 600px) { @@ -7296,11 +7300,11 @@ 
div.rst-footer-buttons a:last-child { div.rst-footer-buttons a:last-child { padding-left: 8.063rem; - + ::after { margin-left: 1.229rem; } - + } } @@ -7343,7 +7347,7 @@ div.rst-footer-buttons a:last-child { div.rst-footer-buttons a:first-child { padding-right: 0; - + ::before { margin-right: .2rem; } @@ -7491,8 +7495,8 @@ span.linenos { color: inherit; background-color: transparent; padding-left: 5px; td.linenos .special { color: #767676; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } span.linenos.special { color: #767676; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } .highlight .hll { background-color: #ffffcc } -.highlight { - background: #1E1E1E; +.highlight { + background: #1E1E1E; color: #FFF; } .highlight .c { color: #8f5902; font-style: italic } /* Comment */ diff --git a/docs/_static/custom.css b/docs/_static/custom.css index 3adc638ee3..b874b0b8f4 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -107,3 +107,8 @@ code, .code, .pre, tt { .rst-content .descclassname { font-weight: normal !important; } + +.highlight .err { + border: 0 none; + color: inherit; +} diff --git a/docs/concepts/api-overview.rst b/docs/concepts/api-overview.rst deleted file mode 100644 index 44e1a43b23..0000000000 --- a/docs/concepts/api-overview.rst +++ /dev/null @@ -1,29 +0,0 @@ -API Overview -============ - -Backend.AI API v3 consists of two parts: User APIs and Admin APIs. - -.. warning:: - - APIv3 breaks backward compatibility a lot, and we will primarily support v3 after June 2017. - Please upgrade your clients immediately. - -API KeyPair Registration ------------------------- - -For managed, best-experience service, you may register to our cloud version of Backend.AI API service instead of installing it to your own machines. -Simply create an account at `cloud.backend.ai `_ and generate a new API keypair. -You may also use social accounts for log-ins such as Twitter, Facebook, and GitHub. 
- -An API keypair is composed of a 20-characters access key (``AKIA...``) and a 40-characters secret key, in a similar form to AWS access keys. - -Currently, the service is BETA: it is free of charge but each user is limited to have only one keypair and have up to 5 concurrent sessions for a given keypair. -Keep you eyes on further announcements for upgraded paid plans. - -Accessing Admin APIs --------------------- - -The admin APIs require a special keypair with the admin privilege: - -* The public cloud service (``api.backend.ai``): It currently does *not* offer any admin privileges to the end-users, as its functionality is already available via our management console at `cloud.backend.ai `_. -* On-premise installation: You will get an auto-generated admin keypair during installation. diff --git a/docs/concepts/computing.rst b/docs/concepts/computing.rst new file mode 100644 index 0000000000..bd44e080b6 --- /dev/null +++ b/docs/concepts/computing.rst @@ -0,0 +1,175 @@ +.. role:: raw-html-m2r(raw) + :format: html + +.. |br| raw:: html + +
+ +Computing +========= + +Sessions and kernels +-------------------- +:raw-html-m2r:`` +:raw-html-m2r:`` + +Backend.AI spawns *sessions* to host various kinds of computation with associated computing resources. +Each session may have one or more *kernels*. +We call sessions with multiple kernels "cluster sessions". + +A *kernel* represents an isolated unit of computation such as a container, a virtual machine, a native process, or even a Kubernetes pod, +depending on the Agent's backend implementation and configurations. +The most common form of a kernel is a Docker container. +For container or VM-based kernels, they are also associated with the base images. +The most common form of a base image is `the OCI container images `_. + +Kernel roles in a cluster session +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In a cluster session with multiple kernels, each kernel has a role. +By default, the first container takes the "main" role while others take the "sub" role. +All kernels are given unique hostnames like "main1", "sub1", "sub2", ..., and "subN" (the cluster size is N+1 in this case). +A non-cluster session has one "main1" kernel only. + +All interactions with a session are routed to its "main1" kernel, +while the "main1" kernel is allowed to access all other kernels via a private network. + +.. seealso:: + + :ref:`concept-cluster-networking` + +Session templates +----------------- + +A session template is a predefined set of parameters to create a session, while they can be overridden by the caller. +It may define additional kernel roles for a cluster session, with different base images and resource specifications. + +Session types +------------- + +There are several classes of sessions for different purposes having different features. + +.. 
list-table:: Features by the session type + :header-rows: 1 + :stub-columns: 1 + + * - Feature + - Compute |br| (Interactive) + - Compute |br| (Batch) + - Inference + - System + * - Code execution + - ✓ + - ✗ + - ✗ + - ✗ + * - Service port + - ✓ + - ✓ + - ✓ + - ✓ + * - Dependencies + - ✗ + - ✓ + - ✗ + - ✗ + * - Session result + - ✗ + - ✓ + - ✗ + - ✗ + * - Clustering + - ✓ + - ✓ + - ✓ + - ✓ + +Compute session is the most generic form of session to host computations. +It has two operation modes: *interactive* and *batch*. + +Interactive compute session +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Interactive compute sessions are used to run various interactive applications and development tools, +such as Jupyter Notebooks, web-based terminals, and etc. +It is expected that the users control their lifecycles (e.g., terminating them) +while Backend.AI offers configuration knobs for the administrators to set idle timeouts with various criteria. + +There are two major ways to interact with an interactive compute session: *service ports* and *the code execution API*. + +Service ports + +TODO: port mapping diagram + +Code execution + +TODO: execution API state diagram + +Batch compute session +~~~~~~~~~~~~~~~~~~~~~ + +Batch compute sessions are used to host a "run-to-completion" script with a finite execution time. +It has two result states: SUCCESS or FAILED, which is defined by whether the main program's exit code is zero or not. + +Dependencies between compute sessions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pipelining + +Inference session +~~~~~~~~~~~~~~~~~ + +Service endpoint and routing + +Auto-scaling + +System session +~~~~~~~~~~~~~~ + +SFTP access + +.. _concept-scheduler: +Scheduling +---------- + +Backend.AI keeps track of sessions using a state-machine to represent the various lifecycle stages of them. + +TODO: session/kernel state diagram + +TODO: two-level scheduler architecture diagram + +.. 
seealso:: + + :ref:`concept-resource-group` + +Session selection strategy +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Heuristic FIFO +^^^^^^^^^^^^^^ + +The default session selection strategy is the heuristic FIFO. +It mostly works like a FIFO queue to select the oldest pending session, +but offers an option to enable a head-of-line (HoL) blocking avoidance logic. + +The HoL blocking problem happens when the oldest pending session requires too much resources so that it cannot be scheduled +while other subsequent pending sessions fit within the available cluster resources. +Those subsequent pending sessions that can be started never have chances until the oldest pending session ("blocker") is either cancelled or more running sessions terminate and release more cluster resources. + +When enabled, the HoL blocking avoidance logic keeps track of the retry count of scheduling attempts of each pending session and pushes back the pending sessions whose retry counts exceed a certain threshold. +This option should be explicitly enabled by the administrators or during installation. + +Dominant resource fairness (DRF) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Agent selection strategy +~~~~~~~~~~~~~~~~~~~~~~~~ + +Concentrated +^^^^^^^^^^^^ + +Dispersed +^^^^^^^^^ + +Custom +^^^^^^ diff --git a/docs/concepts/configuration.rst b/docs/concepts/configuration.rst new file mode 100644 index 0000000000..f7e469fd2e --- /dev/null +++ b/docs/concepts/configuration.rst @@ -0,0 +1,27 @@ +.. role:: raw-html-m2r(raw) + :format: html + +Configuration +------------- + +Shared config +^^^^^^^^^^^^^ +:raw-html-m2r:`` + +Most cluster-level configurations are stored in an Etcd service. +The Etcd server is also used for service discovery; when new agents boot up they register themselves to the cluster manager via etcd. +For production deployments, we recommend to use an Etcd cluster composed of odd (3, 5, or higher) number of nodes to keep high availability. 
+ +Local config +^^^^^^^^^^^^ + +Each service component has a `TOML `_-based local configuration. +It defines node-specific configurations such as the agent name, the resource group where it belongs, specific system limits, the IP address and the TCP port(s) to bind their service traffic, and etc. + +The configuration files are named after the service components, like ``manager.toml``, ``agent.toml``, and ``storage-proxy.toml``. +The search paths are: the current working directory, ``~/.config/backend.ai``, and ``/etc/backend.ai``. + +.. seealso:: + + `The sample configurations in our source repository `_. + Inside each component directory, ``sample.toml`` contains the full configuration schema and descriptions. diff --git a/docs/concepts/faq.rst b/docs/concepts/faq.rst index 5d64968c60..24ed80a11e 100644 --- a/docs/concepts/faq.rst +++ b/docs/concepts/faq.rst @@ -8,11 +8,11 @@ FAQ * - Product - Role - - Problem and Solution + - Value * - Apache Zeppelin, Jupyter Notebook - Notebook-style document + code *frontends* - - Insecure host resource sharing + - Familiarity from data scientists and researchers, but hard to avoid insecure host resource sharing * - **Backend.AI** - Pluggable *backend* to any frontends @@ -28,17 +28,17 @@ FAQ - Value * - Amazon ECS, Kubernetes - - Long-running service daemons + - Long-running interactive services - Load balancing, fault tolerance, incremental deployment - * - **Backend.AI** - - Stateful compute sessions - - Low-cost high-density computation - * - Amazon Lambda, Azure Functions - - Stateless, light-weight functions + - Stateless light-weight, short-lived functions - Serverless, zero-management + * - **Backend.AI** + - Stateful batch computations mixed with interactive applications + - Low-cost high-density computation, maximization of hardware potentials + .. rubric:: vs. Big-data and AI Frameworks .. 
list-table:: @@ -46,15 +46,15 @@ FAQ * - Product - Role - - Problem and Solution + - Value * - TensorFlow, Apache Spark, Apache Hive - Computation runtime - - Difficult to install, configure, and operate + - Difficult to install, configure, and operate at scale * - Amazon ML, Azure ML, GCP ML - Managed MLaaS - - Still complicated for scientists, too restrictive for engineers + - Highly scalable but dependent on each platform, still requires system engineering backgrounds * - **Backend.AI** - Host of computation runtimes diff --git a/docs/concepts/index.rst b/docs/concepts/index.rst index 386105b377..f5b736b427 100644 --- a/docs/concepts/index.rst +++ b/docs/concepts/index.rst @@ -1,9 +1,31 @@ +.. role:: raw-html-m2r(raw) + :format: html Backend.AI Concepts =================== +Here we describe the key concepts that are required to understand and follow this documentation. + +.. _server-arch-diagram: +.. figure:: server-architecture.svg + :align: center + + The diagram of a typical multi-node Backend.AI server architecture + +:numref:`server-arch-diagram` shows a brief Backend.AI server-side architecture where the components are what you need to install and configure. + +Each border-connected group of components is intended to be run on the same server, but you may split them into multiple servers or merge different groups into a single server as you need. +For example, you can run separate servers for the nginx reverse-proxy and the Backend.AI manager or run both on a single server. +In the :doc:`development setup `, all these components run on a single PC such as your laptop. + .. toctree:: - :maxdepth: 2 + :maxdepth: 4 - key-concepts - api-overview + services + computing + resources + users + networking + storage + configuration + monitoring faq diff --git a/docs/concepts/key-concepts.rst b/docs/concepts/key-concepts.rst deleted file mode 100644 index 859f606489..0000000000 --- a/docs/concepts/key-concepts.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. 
role:: raw-html-m2r(raw) - :format: html - - -Key Concepts ------------- - -Here we describe the key concepts that are required to understand and follow this documentation. - -.. _server-arch-diagram: -.. figure:: server-architecture.svg - - The diagram of a typical multi-node Backend.AI server architecture - -:numref:`server-arch-diagram` shows a brief Backend.AI server-side architecture where the components are what you need to install and configure. - -Each border-connected group of components is intended to be run on the same server, but you may split them into multiple servers or merge different groups into a single server as you need. -For example, you can run separate servers for the nginx reverse-proxy and the Backend.AI manager or run both on a single server. -In the :doc:`development setup `, all these components run on a single PC such as your laptop. - -Manager and Agents -^^^^^^^^^^^^^^^^^^ -:raw-html-m2r:`` - -Backend.AI manager is the central governor of the cluster. -It accepts user requests, creates/destroys the sessions, and routes code execution requests to appropriate agents and sessions. -It also collects the output of sessions and responds the users with them. - -Backend.AI agent is a small daemon installed onto individual worker servers to control them. -It manages and monitors the lifecycle of kernel containers, and also mediates the input/output of sessions. -Each agent also reports the resource capacity and status of its server, so that the manager can assign new sessions on idle servers to load balance. - -Compute sessions and Kernels -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:raw-html-m2r:`` -:raw-html-m2r:`` - -Backend.AI spawns compute sessions as the form of containers upon user API requests. -Each compute session may have one or more containers (distributed across different nodes), and we call those member containers "kernels". -Such multi-container sessions are for distributed and parallel computation at large scales. 
-The agent automatically pulls and updates the kernel images if needed. - -Cluster Networking -^^^^^^^^^^^^^^^^^^ -:raw-html-m2r:`` -:raw-html-m2r:`` - -The primary networking requirements are: - -* The manager server (the HTTPS 443 port) should be exposed to the public Internet or the network that your client can access. -* The manager, agents, and all other database/storage servers should reside at the same local private network where any traffic between them are transparently allowed. -* For high-volume big-data processing, you may want to separate the network for the storage using a secondary network interface on each server, such as Infiniband and RoCE adaptors. - -Databases -^^^^^^^^^ -:raw-html-m2r:`` - -Redis and PostgreSQL are used to keep track of liveness of agents and compute sessions (which may be composed of one or more kernels). -They also store user metadata such as keypairs and resource usage statistics. - -Configuration Management -^^^^^^^^^^^^^^^^^^^^^^^^ -:raw-html-m2r:`` - -Most cluster-level configurations are stored in an etcd server or cluster. -The etcd server is also used for service discovery; when new agents boot up they register themselves to the cluster manager via etcd. -For production deployments, we recommend to use an etcd cluster composed of odd (3 or higher) number of nodes to keep high availability. - -Virtual Folders -^^^^^^^^^^^^^^^ -:raw-html-m2r:`` - -.. _vfolder-concept-diagram: -.. figure:: vfolder-concept.svg - - A conceptual diagram of virtual folders when using two NFS servers as vfolder hosts - -As shown in :numref:`vfolder-concept-diagram`, Backend.AI abstracts network storages as "virtual folder", which provides a cloud-like private file storage to individual users. -The users may create their own (one or more) virtual folders to store data files, libraries, and program codes. -Each vfolder (virtual folder) is created under a designated storage mount (called "vfolder hosts"). 
-Virtual folders are mounted into compute session containers at ``/home/work/{name}`` so that user programs have access to the virtual folder contents like a local directory. -As of Backend.AI v18.12, users may also share their own virtual folders with other users in differentiated permissions such as read-only and read-write. - -A Backend.AI cluster setup may use any filesystem that provides a local mount point at each node (including the manager and agents) given that the filesystem contents are synchronized across all nodes. -The only requirement is that the local mount-point must be same across all cluster nodes (e.g., ``/mnt/vfroot/mynfs``). -Common setups may use a centralized network storage (served via NFS or SMB), but for more scalability, one might want to use distributed file systems such as CephFS and GlusterFS, or Alluxio that provides fast in-memory cache while backed by another storage server/service such as AWS S3. - -For a single-node setup, you may simply use an empty local directory. diff --git a/docs/concepts/monitoring.rst b/docs/concepts/monitoring.rst new file mode 100644 index 0000000000..fc2318aa52 --- /dev/null +++ b/docs/concepts/monitoring.rst @@ -0,0 +1,16 @@ +.. role:: raw-html-m2r(raw) + :format: html + +Monitoring +---------- + +Dashboard (Enterprise only) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Backend.AI Dashboard is an add-on service that displays various real-time and historical performance metrics. +The metrics include the number of sessions, cluster power usage, GPU utilization, and etc. + +Alerts (Enterprise only) +~~~~~~~~~~~~~~~~~~~~~~~~ + +Administrators may configure automatic alerts based on several thresholds on the monitored metrics, via an external messaging service like emails and SMS. diff --git a/docs/concepts/networking.rst b/docs/concepts/networking.rst new file mode 100644 index 0000000000..889a00957b --- /dev/null +++ b/docs/concepts/networking.rst @@ -0,0 +1,60 @@ +.. role:: raw-html-m2r(raw) + :format: html + +.. 
_concept-cluster-networking: +Cluster Networking +------------------ + +Single-node cluster session +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If a session is created with multiple containers with a single-node option, all containers are created in a single agent. +The containers share a private bridge network in addition to the default network, so that they could interact with each other privately. +There are no firewall restrictions in this private bridge network. + +Multi-node cluster session +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For even larger-scale computation, you may create a multi-node cluster session that spans across multiple agents. + +In this case, the manager auto-configures a private overlay network, so that the containers could interact with each other. +There are no firewall restrictions in this private overlay network. + +Detection of clustered setups +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There is a concept called *cluster role*. +The current version of Backend.AI creates homogeneous cluster sessions by replicating the same resource configuration and the same container image, +but we have plans to add heterogeneous cluster sessions that have different resource and image configurations for each cluster role. +For instance, a Hadoop cluster may have two types of containers: name nodes and data nodes, where they could be mapped to ``main`` and ``sub`` cluster roles. + +All interactive apps are executed only in the ``main1`` container which is always present in both cluster and non-cluster sessions. +It is the user application's responsibility to connect with and utilize other containers in a cluster session. +To ease the process, Backend.AI injects the following environment variables into the containers and sets up randomly-generated SSH keypairs between the containers so that each container can ssh into others without additional prompts: + +.. 
list-table:: + :header-rows: 1 + + * - Environment Variable + - Meaning + - Examples + * - ``BACKENDAI_CLUSTER_SIZE`` + - The number of containers in this cluster session. + - ``4`` + * - ``BACKENDAI_CLUSTER_HOSTS`` + - A comma-separated list of container hostnames in this cluster session. + - ``main1,sub1,sub2,sub3`` + * - ``BACKENDAI_CLUSTER_REPLICAS`` + - A comma-separated key:value pairs of cluster roles and the replica counts for each role. + - ``main:1,sub:3`` + * - ``BACKENDAI_CLUSTER_HOST`` + - The container hostname of the current container. + - ``main1`` + * - ``BACKENDAI_CLUSTER_IDX`` + - The one-based index of the current container from the containers sharing the same cluster role. + - ``1`` + * - ``BACKENDAI_CLUSTER_ROLE`` + - The name of the current container's cluster role. + - ``main`` + * - ``BACKENDAI_CLUSTER_LOCAL_RANK`` + - The zero-based global index of the current container within the entire cluster session. + - ``0`` diff --git a/docs/concepts/resources.rst b/docs/concepts/resources.rst new file mode 100644 index 0000000000..8ffcdb9b20 --- /dev/null +++ b/docs/concepts/resources.rst @@ -0,0 +1,81 @@ +.. role:: raw-html-m2r(raw) + :format: html + +Resource Management +=================== + +Resource slots +-------------- + +Backend.AI abstracts each different type of computing resources as a "resource slot". +Resource slots are distinguished by its name consisting of two parts: the device name and the slot name. + +.. list-table:: + :header-rows: 1 + + * - Resource slot name + - Device name + - Slot name + * - ``cpu`` + - ``cpu`` + - (implicitly defined as ``root``) + * - ``mem`` + - ``mem`` + - (implicitly defined as ``root``) + * - ``cuda.device`` + - ``cuda`` + - ``device`` + * - ``cuda.shares`` + - ``cuda`` + - ``shares`` + * - ``cuda.mig-2c10g`` + - ``cuda`` + - ``mig-2c10g`` + +Each resource slot has a slot type as follows: + +.. 
list-table:: + :header-rows: 1 + + * - Slot type + - Meaning + - Examples + * - ``COUNT`` + - The value of the resource slot is an integer or decimal to represent how many of the device(s) are available/allocated. + It may also represent fractions of devices. + - ``cpu``, ``cuda.device``, ``cuda.shares`` + * - ``BYTES`` + - The value of the resource slot is an integer to represent how many bytes of the resources are available/allocated. + - ``mem`` + * - ``UNIQUE`` + - Only "each one" of the device can be allocated to each different kernel exclusively. + - ``cuda.mig-10g`` + +Compute plugins +--------------- + +Backend.AI administrators may install one or more compute plugins to each agent. +Without any plugin, only the intrinsic ``cpu`` and ``mem`` resource slots are available. + +Each compute plugin may declare one or more resource slots. +The plugin is invoked upon startup of the agent to get the list of devices and the resource slots to report. +Administrators can inspect the per-agent accelerator details provided by the compute plugins in the control panel. + +The most well-known compute plugin is ``cuda_open``, which is included in the open source version. +It declares the ``cuda.device`` resource slot that represents each NVIDIA GPU as one unit. + +There is a special compute plugin to simulate non-existent devices: ``mock``. +Developers may put a local configuration to declare an arbitrary set of devices and resource slots to test the schedulers and the frontend. +It is useful to develop integrations with new hardware devices before you get the actual devices on your hands. + +.. _concept-resource-group: +Resource groups +--------------- + +Resource group is a logical group of the Agents with independent schedulers. +Each agent belongs to a single resource group only. +It self-reports which resource group to join when sending the heartbeat messages, but the specified resource group must exist beforehand. + +.. 
seealso:: + + :ref:`concept-scheduler` diff --git a/docs/concepts/server-architecture.svg b/docs/concepts/server-architecture.svg index e3cfef2b7e..22215c8ef3 100644 --- a/docs/concepts/server-architecture.svg +++ b/docs/concepts/server-architecture.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/concepts/services.rst b/docs/concepts/services.rst new file mode 100644 index 0000000000..1997fc5bbb --- /dev/null +++ b/docs/concepts/services.rst @@ -0,0 +1,117 @@ +.. role:: raw-html-m2r(raw) + :format: html + +Service Components +------------------ + +Public-facing services +~~~~~~~~~~~~~~~~~~~~~~ + +Manager and Webserver +^^^^^^^^^^^^^^^^^^^^^ +:raw-html-m2r:`` + +Backend.AI manager is the central governor of the cluster. +It accepts user requests, creates/destroys the sessions, and routes code execution requests to appropriate agents and sessions. +It also collects the output of sessions and responds the users with them. + +Backend.AI agent is a small daemon installed onto individual worker servers to control them. +It manages and monitors the lifecycle of kernel containers, and also mediates the input/output of sessions. +Each agent also reports the resource capacity and status of its server, so that the manager can assign new sessions on idle servers to load balance. + +:raw-html-m2r:`` +:raw-html-m2r:`` + +The primary networking requirements are: + +* The manager server (the HTTPS 443 port) should be exposed to the public Internet or the network that your client can access. +* The manager, agents, and all other database/storage servers should reside at the same local private network where any traffic between them are transparently allowed. +* For high-volume big-data processing, you may want to separate the network for the storage using a secondary network interface on each server, such as Infiniband and RoCE adaptors. 
+ +App Proxy +^^^^^^^^^ + +Backend.AI App Proxy is a proxy to mediate the traffic between user applications and clients like browsers. +It provides the central place to set the networking and firewall policy for the user application traffic. + +It has two operation modes: + +* Port mapping: Individual app instances are mapped with a TCP port taken from a pre-configured TCP port range. +* Wildcard subdomain: Individual app instances are mapped with a system-generated subdomain under the given top-level domain. + +Depending on the session type and application launch configurations, it may require an authenticated HTTP session for HTTP-based applications. +For instance, you may enforce authentication for interactive development apps like Jupyter while allowing anonymous access for AI model service APIs. + +Storage Proxy +^^^^^^^^^^^^^ + +Backend.AI Storage Proxy is a proxy to offload the large file transfers from the manager. +It also provides an abstraction of underlying storage vendor's acceleration APIs since many storage vendors offer vendor-specific APIs for filesystem operations like scanning of directories with millions of files. +Using the storage proxy, we apply our abstraction models for such filesystem operations and quota management specialized to each vendor API. + +FastTrack (Enterprise only) +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Backend.AI FastTrack is an add-on service running on top of the manager that features a slick GUI to design and run pipelines of computation tasks. +It makes it easier to monitor the progress of various MLOps pipelines running concurrently, and allows sharing of such pipelines in portable ways. + +Resource Management +~~~~~~~~~~~~~~~~~~~ + +Sokovan Orchestrator +^^^^^^^^^^^^^^^^^^^^ + +Backend.AI Sokovan is the central cluster-level scheduler running inside the manager. +It monitors the resource usage of agents and assigns new containers from the job queue to the agents.
+ +Each :ref:`resource group ` may have separate scheduling policy and options. +The scheduling algorithm may be extended using a common abstract interface. +A scheduler implementation accepts the list of currently running sessions, the list of pending sessions in the job queue, and the current resource usage of target agents. +It then outputs the choice of a pending session to start and the assignment of an agent to host it. + +Agent +^^^^^ + +Backend.AI Agent is a small daemon running at each compute node like a GPU server. +Its main job is to control and monitor the containers via Docker, but also includes an abstraction of various "compute process" backends. +It publishes various types of container-related events so that the manager could react to status updates of containers. + +When the manager assigns a new container, the agent decides the device-level resource mappings for the container considering optimal hardware layouts such as NUMA and the PCIe bus locations of accelerator and network devices. + +Internal services +~~~~~~~~~~~~~~~~~ + +Event bus +^^^^^^^^^ +:raw-html-m2r:`` + +Backend.AI uses Redis to keep track of various real-time information and notify system events to other service components. + +Control Panel (Enterprise only) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Backend.AI Control Panel is an add-on service to the manager for advanced management and monitoring. +It provides a dedicated superadmin GUI, featuring batch creation and modification of the users, detailed configuration of various resource policies, and etc. + +Forklift (Enterprise only) +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Backend.AI Forklift is a standalone service that eases building new container images from scratch or importing existing ones that are compatible with Backend.AI. + +Reservoir (Enterprise only) +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Backend.AI Reservoir is an add-on service to provide open source package mirrors for air-gapped setups. 
+ +Container Registry +^^^^^^^^^^^^^^^^^^ + +Backend.AI supports integration with several common container registry solutions, while open source users may also rely on our official registry service with prebuilt images in https://cr.backend.ai: + +* `Docker's vanilla open-source registry `_ + + - It is simplest to set up but does not provide advanced access controls and namespacing over container images. + +* `Harbor v2 `_ (recommended) + + - It provides a full-fledged container registry service including ACLs with project/user memberships, cloning from/to remote registries, on-premise and cloud deployments, security analysis, etc. diff --git a/docs/concepts/storage.rst b/docs/concepts/storage.rst new file mode 100644 index 0000000000..2fd3345331 --- /dev/null +++ b/docs/concepts/storage.rst @@ -0,0 +1,111 @@ +.. role:: raw-html-m2r(raw) + :format: html + +Storage Management +------------------ + +Virtual folders +~~~~~~~~~~~~~~~ +:raw-html-m2r:`` + +Backend.AI abstracts network storages as a set of "virtual folders" (aka "vfolders"), which provides persistent file storage to users and projects. + +When creating a new session, users may connect vfolders to it with read-only or read-write permissions. +If the shared vfolder has limited the permission to read-only, then the user may connect it with the read-only permission only. +Virtual folders are mounted into compute session containers at ``/home/work/{name}`` so that user programs have access to the virtual folder contents like a local directory. +The mounted path inside containers may be customized (e.g., ``/workspace``) for compatibility with existing scripts and codes. +Currently it is not possible to unmount or delete a vfolder when there are any running sessions connected to it. +For cluster sessions having multiple kernels (containers), the connected vfolders are mounted to all kernels at the same location with the same permission.
+ +For a multi-node setup, the storage volume mounts must be synchronized across all Agent nodes and the Storage Proxy node(s) using the same mount path (e.g., ``/mnt`` or ``/vfroot``). +For a single-node setup, you may simply use an empty local directory, like our ``install-dev.sh`` script (`link `_) does. + +From the perspective of the storage, all vfolders from different Backend.AI users and projects share a single, common UID and GID. +This allows flexible permission sharing between users and projects, while keeping the Linux ownership of the files and directories consistent when they are accessed by multiple different Backend.AI users. + +User-owned vfolders +^^^^^^^^^^^^^^^^^^^ + +Users may create one or more virtual folders of their own to store data files, libraries, and program codes. +The superadmins may limit the maximum number of vfolders owned by a user. + +Project-owned vfolders +^^^^^^^^^^^^^^^^^^^^^^ + +The project admins and superadmins may create a vfolder that is automatically shared to all members of the project, +with a specific read-only or read-write permission. + +.. note:: + + If allowed, users and projects may create and access vfolders in multiple different storage volumes, + but the vfolder names must be unique in all storage volumes, for each user and project. + +VFolder invitations and permissions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Users and project administrators may invite other users to collaborate on a vfolder. +Once the invitee accepts the request, he/she gets the designated read-only or read-write permission on the shared vfolder. + +Volume-level permissions +^^^^^^^^^^^^^^^^^^^^^^^^ + +The superadmin may set additional action privileges to each storage volume, +such as whether to allow or block mounting the vfolders in compute sessions, cloning the vfolders, etc.
+ +Auto-mount vfolders +^^^^^^^^^^^^^^^^^^^ + +If a user-owned vfolder's name starts with a dot, it is automatically mounted at ``/home/work`` for all sessions created by the user. +A good use case is ``.config`` and ``.local`` directories to keep your local configurations and user-installed packages (e.g., ``pip install --user``) persistent across all your sessions. + + +Quota scopes +~~~~~~~~~~~~ + +.. versionadded:: 23.03 + +Quota scopes implement per-user and per-project storage usage limits. +Currently it supports the hard limits specified in bytes. +There are two main schemes to set up this feature. + +Storage with per-directory quota +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. _vfolder-dir-quota: +.. figure:: vfolder-dir-quota.svg + :width: 80% + :align: center + + Quota scopes and vfolders with storage solutions supporting per-directory quota + +For each storage volume, each user and project has their own dedicated quota scope directories as shown in :numref:`vfolder-dir-quota`. +The storage solution must support per-directory quota, at least for a single-level (like NetApp's QTree). +We recommend this configuration for filesystems like CephFS, Weka.io, or custom-built storage servers using ZFS or XFS where Backend.AI Storage Proxy can be installed directly onto the storage servers. + +Storage with per-volume quota +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. _vfolder-volume-quota: +.. figure:: vfolder-volume-quota.svg + :width: 72% + :align: center + + Quota scopes and vfolders with storage solutions supporting per-volume quota + +Unfortunately, there are many cases where we cannot rely on per-directory quota support in storage solutions, +due to limitations of the underlying filesystem implementation or having no direct access to the storage vendor APIs. + +For this case, we may assign dedicated storage volumes to each user and project like :numref:`vfolder-volume-quota`, +which *naturally* limits the space usage by the volume size.
+Another option is not to configure quota limits, but we don't recommend this option in production setups. + +The shortcoming is that we may need to frequently mount/unmount the network volumes when we create or remove users and projects, which may cause unexpected system failures due to stale file descriptors. + +.. note:: + + For shared vfolders, the quota usage is accounted to the original owner of the vfolder, either a user or a project. + +.. warning:: + + For both schemes, the administrator should take care of the storage solution's system limits such as the maximum number of volumes and quota sets + because such limits may impose a hidden limit on the maximum number of users and projects in Backend.AI. diff --git a/docs/concepts/users.rst b/docs/concepts/users.rst new file mode 100644 index 0000000000..927f392f77 --- /dev/null +++ b/docs/concepts/users.rst @@ -0,0 +1,19 @@ +.. role:: raw-html-m2r(raw) + :format: html + +User Management +=============== + +Users +----- + +Backend.AI's user account has two types of authentication modes: *session* and *keypair*. +The session mode just uses the normal username and password based on browser sessions (e.g., when using the Web UI), while the keypair mode uses a pair of access and secret keys for programmatic access. + +Projects +-------- + +There may be multiple projects created by administrators, and users may belong to one or more projects. +Administrators may configure project-level resource policies such as storage quota shared by all project vfolders and project-level artifacts. + +When a user creates a new session, he/she must choose which project to use if he/she belongs to multiple projects, so as to be in line with the resource policies.
diff --git a/docs/concepts/vfolder-dir-quota.svg b/docs/concepts/vfolder-dir-quota.svg new file mode 100644 index 0000000000..6e74a2b392 --- /dev/null +++ b/docs/concepts/vfolder-dir-quota.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/concepts/vfolder-volume-quota.svg b/docs/concepts/vfolder-volume-quota.svg new file mode 100644 index 0000000000..3e07f4e4d6 --- /dev/null +++ b/docs/concepts/vfolder-volume-quota.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/dev/daily-workflows.rst b/docs/dev/daily-workflows.rst index 0ed2d89b73..7f065f4226 100644 --- a/docs/dev/daily-workflows.rst +++ b/docs/dev/daily-workflows.rst @@ -316,19 +316,18 @@ Then put the followings in ``.vimrc`` (or ``.nvimrc`` for NeoVim) in the build r let g:ale_fix_on_save = 1 When using CoC, run ``:CocInstall coc-pyright @yaegassy/coc-ruff`` and ``:CocLocalConfig`` after opening a file -in the local working copy to initialize PyRight functionalities. +in the local working copy to initialize Pyright functionalities. In the local configuration file (``.vim/coc-settings.json``), you may put the linter/formatter configurations -just like VSCode (see `the official reference `_): -To activate Ruff (a Python linter and fixer), run ``:CocCommand ruff.builtin.installServer``. +just like VSCode (see `the official reference `_). .. 
code-block:: json { "coc.preferences.formatOnType": true, - "coc.preferences.formatOnSaveFiletypes": ["python"], + "coc.preferences.formatOnSaveFiletypes": [], // Use the autocmd config "coc.preferences.willSaveHandlerTimeout": 5000, "ruff.enabled": true, - "ruff.autoFixOnSave": false, # Use code actions to fix individual errors + "ruff.autoFixOnSave": false, // Use the autocmd config "ruff.useDetectRuffCommand": false, "ruff.builtin.pythonPath": "dist/export/python/virtualenvs/ruff/3.11.4/bin/python", "ruff.serverPath": "dist/export/python/virtualenvs/ruff/3.11.4/bin/ruff-lsp", @@ -339,6 +338,27 @@ To activate Ruff (a Python linter and fixer), run ``:CocCommand ruff.builtin.ins "python.linting.mypyPath": "dist/export/python/virtualenvs/mypy/3.11.4/bin/mypy", } +To activate Ruff (a Python linter and fixer), run ``:CocCommand ruff.builtin.installServer`` +after opening any Python source file to install the ``ruff-lsp`` server. + +Unfortunately, CoC does not support applying multiple formatters on save, so we should call +them serially using ``autocmd``. +To configure it, put the following vimscript as ``.exrc`` in the working copy root: + +.. code-block:: vim + + function! OrganizeAndFormat() + CocCommand ruff.executeOrganizeImports + " Optionally you may apply "ruff.executeAutofix" to apply all possible fixes. + sleep 50m " to avoid races + call CocAction('format') + sleep 50m + endfunction + + augroup autofix + autocmd!
+ autocmd BufWritePre *.py if &modified | call OrganizeAndFormat() | endif + augroup END Switching between branches ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/requirements.txt b/docs/requirements.txt index 588ea01913..a8ce67f4e0 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,12 +1,11 @@ -Sphinx~=5.0 -sphinxcontrib-trio>=1.1.0 -sphinx-autodoc-typehints~=1.11.1 -sphinx-intl>=2.0 -sphinx-rtd-theme~=1.0 +Sphinx~=7.2 +sphinxcontrib-trio>=1.1.2 +sphinx-autodoc-typehints~=1.24.0 +sphinx-intl>=2.1 +sphinx-rtd-theme~=1.3 sphinxcontrib-mermaid~=0.7.1 sphinxcontrib-openapi~=0.8.1 rtds-action~=1.1.0 -pygments~=2.12 +pygments~=2.16.1 # replace the following line with native support of Sphinx in Pants -r ../requirements.txt --e git+https://github.com/lablup/pygments-graphql-lexer#egg=pygments-graphql-lexer