From f47c1f4dfc345325b09d08aa6e6e8d1ffed5eb43 Mon Sep 17 00:00:00 2001 From: gh-ci-deploy-docs Date: Thu, 17 Oct 2024 06:35:03 +0000 Subject: [PATCH] Deployed 84094f6f to latest with MkDocs 1.2.4 and mike 1.1.2 --- latest/examples/index.html | 2 +- .../examples/iterative-computation/index.html | 2 +- latest/jobs/jobfile/index.html | 32 +++++----- .../apidoc/hyperqueue.client.Client.html | 2 +- ...hyperqueue.client.FailedJobsException.html | 2 +- latest/python/apidoc/hyperqueue.client.html | 2 +- .../hyperqueue.cluster.LocalCluster.html | 2 +- .../hyperqueue.cluster.WorkerConfig.html | 2 +- latest/python/apidoc/hyperqueue.cluster.html | 2 +- latest/python/apidoc/hyperqueue.common.html | 2 +- ...yperqueue.ffi.client.ClientConnection.html | 2 +- ...perqueue.ffi.client.FailedTaskContext.html | 2 +- ...hyperqueue.ffi.client.HqClientContext.html | 2 +- .../python/apidoc/hyperqueue.ffi.client.html | 2 +- .../hyperqueue.ffi.cluster.Cluster.html | 2 +- ...perqueue.ffi.cluster.HqClusterContext.html | 2 +- .../python/apidoc/hyperqueue.ffi.cluster.html | 2 +- latest/python/apidoc/hyperqueue.ffi.html | 2 +- ...yperqueue.ffi.protocol.JobDescription.html | 2 +- ...perqueue.ffi.protocol.ResourceRequest.html | 2 +- ...perqueue.ffi.protocol.TaskDescription.html | 2 +- .../apidoc/hyperqueue.ffi.protocol.html | 2 +- latest/python/apidoc/hyperqueue.html | 2 +- latest/python/apidoc/hyperqueue.job.Job.html | 2 +- .../apidoc/hyperqueue.job.SubmittedJob.html | 2 +- latest/python/apidoc/hyperqueue.job.html | 2 +- .../apidoc/hyperqueue.output.Output.html | 2 +- .../apidoc/hyperqueue.output.StdioDef.html | 2 +- latest/python/apidoc/hyperqueue.output.html | 2 +- .../hyperqueue.task.function.PythonEnv.html | 2 +- ...perqueue.task.function.PythonFunction.html | 2 +- .../apidoc/hyperqueue.task.function.html | 2 +- ...ue.task.function.wrapper.CloudWrapper.html | 2 +- .../hyperqueue.task.function.wrapper.html | 2 +- latest/python/apidoc/hyperqueue.task.html | 2 +- ...perqueue.task.program.ExternalProgram.html | 2 +- .../apidoc/hyperqueue.task.program.html | 2 +- .../apidoc/hyperqueue.task.task.Task.html | 2 +- .../python/apidoc/hyperqueue.task.task.html | 2 +- latest/python/apidoc/hyperqueue.utils.html | 2 +- ...utils.package.MissingPackageException.html | 2 +- .../apidoc/hyperqueue.utils.package.html | 2 +- .../apidoc/hyperqueue.utils.string.html | 2 +- ...rqueue.validation.ValidationException.html | 2 +- .../python/apidoc/hyperqueue.validation.html | 2 +- .../apidoc/hyperqueue.visualization.html | 2 +- latest/python/apidoc/index.html | 2 +- .../apidoc/source+hyperqueue.__init__.py.html | 2 +- .../apidoc/source+hyperqueue.client.py.html | 2 +- ...source+hyperqueue.cluster.__init__.py.html | 2 +- .../apidoc/source+hyperqueue.common.py.html | 2 +- .../source+hyperqueue.ffi.__init__.py.html | 2 +- .../source+hyperqueue.ffi.client.py.html | 2 +- .../source+hyperqueue.ffi.cluster.py.html | 2 +- .../source+hyperqueue.ffi.protocol.py.html | 2 +- .../apidoc/source+hyperqueue.job.py.html | 2 +- .../apidoc/source+hyperqueue.output.py.html | 2 +- ...+hyperqueue.task.function.__init__.py.html | 2 +- ...e+hyperqueue.task.function.wrapper.py.html | 2 +- .../source+hyperqueue.task.program.py.html | 2 +- .../source+hyperqueue.task.task.py.html | 2 +- .../source+hyperqueue.utils.package.py.html | 2 +- .../source+hyperqueue.utils.string.py.html | 2 +- .../source+hyperqueue.validation.py.html | 2 +- .../source+hyperqueue.visualization.py.html | 2 +- latest/search/search_index.json | 2 +- latest/sitemap.xml | 60 +++++++++--------- 
latest/sitemap.xml.gz | Bin 452 -> 452 bytes 68 files changed, 111 insertions(+), 111 deletions(-) diff --git a/latest/examples/index.html b/latest/examples/index.html index 7ee0a5446..1788e9347 100644 --- a/latest/examples/index.html +++ b/latest/examples/index.html @@ -1 +1 @@ - Examples - HyperQueue
Skip to content

Examples#

Here you can find several examples of how HyperQueue can be used for various use cases, both with the command-line interface and with the Python API.

You can view these examples either in the documentation or on GitHub.


Last update: October 15, 2024
Created: October 15, 2024
Back to top
\ No newline at end of file + Examples - HyperQueue
Skip to content

Examples#

Here you can find several examples of how HyperQueue can be used for various use cases, both with the command-line interface and with the Python API.

You can view these examples either in the documentation or on GitHub.


Last update: October 17, 2024
Created: October 17, 2024
Back to top
\ No newline at end of file diff --git a/latest/examples/iterative-computation/index.html b/latest/examples/iterative-computation/index.html index 5b9e3eac7..dc5981595 100644 --- a/latest/examples/iterative-computation/index.html +++ b/latest/examples/iterative-computation/index.html @@ -32,4 +32,4 @@ break fi done -
Last update: October 15, 2024
Created: October 15, 2024
Back to top
\ No newline at end of file +
Last update: October 17, 2024
Created: October 17, 2024
Back to top
\ No newline at end of file diff --git a/latest/jobs/jobfile/index.html b/latest/jobs/jobfile/index.html index c2ebfbf35..9577aa2f5 100644 --- a/latest/jobs/jobfile/index.html +++ b/latest/jobs/jobfile/index.html @@ -7,23 +7,23 @@ max_fails = 11 [[task]] -stdout = "testout-%{TASK_ID} -stderr = { path = "testerr-%{TASK_ID}", mode = "rm-if-finished" } -task_dir = true -time_limit = "1m 10s" -priority = -1 -crash_limit = 12 -command = ["/bin/bash", "-c", "echo $ABC"] +stdout = "testout-%{TASK_ID}" +stderr = { path = "testerr-%{TASK_ID}", mode = "rm-if-finished" } +task_dir = true +time_limit = "1m 10s" +priority = -1 +crash_limit = 12 +command = ["/bin/bash", "-c", "echo $ABC"] -# Environment variables -env = { "ABC" = "123", "XYZ" = "aaaa" } +# Environment variables +env = { "ABC" = "123", "XYZ" = "aaaa" } -# Content that will be written on stdin -stdin = "Hello world!" +# Content that will be written on stdin +stdin = "Hello world!" -[[task.request]] -resources = { "cpus" = "4 compact!", "gpus" = 2 } -time_request = "10s" +[[task.request]] +resources = { "cpus" = "4 compact!", "gpus" = 2 } +time_request = "10s"

More tasks#

More tasks with different configurations may be defined as follows:

[[task]]
 command = ["sleep", "1"]
 
@@ -60,11 +60,11 @@
 [[task]]
 id = 5
 command = [...]
-deps = [1, 3] # <---- Dependancy on tasks 1 and 3
+deps = [1, 3] # <---- Dependency on tasks 1 and 3
 

Resource variants#

A task may define more than one resource configuration. HyperQueue takes all of these configurations into account during scheduling, but exactly one of them is chosen when the task starts. If more than one configuration is possible at a given moment, the configuration defined first has a higher priority.

The following configuration defines that a task may be executed either on 1 CPU and 1 GPU, or on 4 CPUs.

[[task]]
 command = [...]
 [[task.request]]
 resources = { "cpus" = 1, "gpus" = 1 }
 [[task.request]]
 resources = { "cpus" = 4 }
-

If many tasks with such a configuration are submitted to a worker with 16 CPUs and 4 GPUs, HyperQueue will simultaneously run 4 tasks in the first configuration and 3 tasks in the second one.

When a task with resource variants is started, HyperQueue sets the environment variable HQ_RESOURCE_VARIANT to the index of the chosen variant (counted from 0).

Non-integer resource amounts#

You may specify a resource amount as a float, e.g. resources = { "foo" = 1.5 }. This is valid, but the value is internally converted to a float, which may for some numbers lead to rounding when the value is converted to the 4-digit precision of resource amounts. If you want to avoid this, put the number into quotes, e.g. resources = { "foo" = "1.5" }.


Last update: September 16, 2024
Created: March 30, 2023
Back to top
\ No newline at end of file +

If many tasks with such a configuration are submitted to a worker with 16 CPUs and 4 GPUs, HyperQueue will simultaneously run 4 tasks in the first configuration and 3 tasks in the second one.

When a task with resource variants is started, HyperQueue sets the environment variable HQ_RESOURCE_VARIANT to the index of the chosen variant (counted from 0).
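
For illustration, a task can branch on the chosen variant by reading this environment variable; a minimal Python sketch (the branch bodies are placeholders):

    import os

    # HQ_RESOURCE_VARIANT holds the index (counted from 0) of the variant chosen by HyperQueue.
    variant = int(os.environ.get("HQ_RESOURCE_VARIANT", "0"))
    if variant == 0:
        print("running the 1 CPU + 1 GPU code path")   # placeholder
    else:
        print("running the 4 CPU code path")           # placeholder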

Non-integer resource amounts#

You may specify a resource amount as a float, e.g. resources = { "foo" = 1.5 }. This is valid, but the value is internally converted to a float, which may for some numbers lead to rounding when the value is converted to the 4-digit precision of resource amounts. If you want to avoid this, put the number into quotes, e.g. resources = { "foo" = "1.5" }.


Last update: October 17, 2024
Created: March 30, 2023
Back to top
\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.client.Client.html b/latest/python/apidoc/hyperqueue.client.Client.html index 75b354654..6ad093d22 100644 --- a/latest/python/apidoc/hyperqueue.client.Client.html +++ b/latest/python/apidoc/hyperqueue.client.Client.html @@ -1 +1 @@ -hyperqueue.client.Client

Class Client

A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

Declaration

class Client
source link

Documentation

Methods

  • def __init__(self, server_dir: Optional[GenericPath] = None, ...)

    A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

    def __init__(
    self,
    server_dir: Optional[GenericPath] = None,
    python_env: Optional[PythonEnv] = None,
    )

    Parameters

    • server_dir: Optional[GenericPath]

      Path to a server directory of a running HyperQueue server.

    • python_env: Optional[PythonEnv]

      Python environment which configures Python tasks created by Job.function.

  • def forget(self, job: HasJobId)

    Forget a completed job to free up its resources from the server.

    Parameters

    • job: HasJobId

      Submitted job (or job ID) that will be forgotten.

  • def get_failed_tasks(self, job: SubmittedJob) -> Dict[TaskId, FailedTaskContext]
  • def submit(self, job: Job) -> SubmittedJob

    Submit a job into HyperQueue.

    Parameters

    • job: Job

      Job that will be submitted.

  • def wait_for_jobs(self, jobs: Sequence[SubmittedJob], raise_on_error=True) -> bool

    Returns True if all tasks finished successfully.

Reexports

\ No newline at end of file +hyperqueue.client.Client

Class Client

A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

Declaration

class Client
source link

Documentation

Methods

  • def __init__(self, server_dir: Optional[GenericPath] = None, ...)

    A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

    def __init__(
    self,
    server_dir: Optional[GenericPath] = None,
    python_env: Optional[PythonEnv] = None,
    )

    Parameters

    • server_dir: Optional[GenericPath]

      Path to a server directory of a running HyperQueue server.

    • python_env: Optional[PythonEnv]

      Python environment which configures Python tasks created by Job.function.

  • def forget(self, job: HasJobId)

    Forget a completed job to free up its resources from the server.

    Parameters

    • job: HasJobId

      Submitted job (or job ID) that will be forgotten.

  • def get_failed_tasks(self, job: SubmittedJob) -> Dict[TaskId, FailedTaskContext]
  • def submit(self, job: Job) -> SubmittedJob

    Submit a job into HyperQueue.

    Parameters

    • job: Job

      Job that will be submitted.

  • def wait_for_jobs(self, jobs: Sequence[SubmittedJob], raise_on_error=True) -> bool

    Returns True if all tasks finished successfully.
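
    A minimal usage sketch (it assumes a HyperQueue server is already running and its server directory is accessible; the submitted command is just a placeholder):

    from hyperqueue import Client, Job

    client = Client()                                 # connect to the running server
    job = Job()
    job.program(["echo", "hello from HyperQueue"])    # placeholder task
    submitted = client.submit(job)
    # With raise_on_error=True (the default), a task failure raises FailedJobsException.
    client.wait_for_jobs([submitted])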

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.client.FailedJobsException.html b/latest/python/apidoc/hyperqueue.client.FailedJobsException.html index 1cefa6afc..aa1f2f1b6 100644 --- a/latest/python/apidoc/hyperqueue.client.FailedJobsException.html +++ b/latest/python/apidoc/hyperqueue.client.FailedJobsException.html @@ -1 +1 @@ -hyperqueue.client.FailedJobsException

Class FailedJobsException

This exception is triggered if a task fails.

Declaration

class FailedJobsException(Exception)
source link

Documentation

Methods

\ No newline at end of file +hyperqueue.client.FailedJobsException

Class FailedJobsException

This exception is triggered if a task fails.

Declaration

class FailedJobsException(Exception)
source link

Documentation

Methods

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.client.html b/latest/python/apidoc/hyperqueue.client.html index fca06dbe7..4a219f760 100644 --- a/latest/python/apidoc/hyperqueue.client.html +++ b/latest/python/apidoc/hyperqueue.client.html @@ -1 +1 @@ -hyperqueue.client

Module client

source link

Classes

  • class Client

    A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

  • class FailedJobsException

    This exception is triggered if a task fails.

Functions

\ No newline at end of file +hyperqueue.client

Module client

source link

Classes

  • class Client

    A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

  • class FailedJobsException

    This exception is triggered if a task fails.

Functions

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.cluster.LocalCluster.html b/latest/python/apidoc/hyperqueue.cluster.LocalCluster.html index 893f77389..08f3b157b 100644 --- a/latest/python/apidoc/hyperqueue.cluster.LocalCluster.html +++ b/latest/python/apidoc/hyperqueue.cluster.LocalCluster.html @@ -2,4 +2,4 @@ client = cluster.client() ... # The cluster was stopped -

Methods

Reexports

\ No newline at end of file +

Methods
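
A minimal usage sketch (the start_worker() call and its no-argument form are assumptions based on the WorkerConfig class documented in this module; the cluster stops automatically when the with block exits):

    from hyperqueue import Job, LocalCluster

    with LocalCluster() as cluster:
        cluster.start_worker()              # assumed helper that spawns a local worker
        client = cluster.client()           # Client connected to this local cluster
        job = Job()
        job.program(["echo", "hello"])      # placeholder task
        client.wait_for_jobs([client.submit(job)])
    # The cluster was stopped here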

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.cluster.WorkerConfig.html b/latest/python/apidoc/hyperqueue.cluster.WorkerConfig.html index fa857bdd0..4fe9db8f0 100644 --- a/latest/python/apidoc/hyperqueue.cluster.WorkerConfig.html +++ b/latest/python/apidoc/hyperqueue.cluster.WorkerConfig.html @@ -1 +1 @@ -hyperqueue.cluster.WorkerConfig

Class WorkerConfig

Configuration of a worker spawned by a local cluster.

Declaration

@dataclasses.dataclass
class WorkerConfig
source link

Documentation

\ No newline at end of file +hyperqueue.cluster.WorkerConfig

Class WorkerConfig

Configuration of a worker spawned by a local cluster.

Declaration

@dataclasses.dataclass
class WorkerConfig
source link

Documentation

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.cluster.html b/latest/python/apidoc/hyperqueue.cluster.html index 088e1fb68..7bbd5a29f 100644 --- a/latest/python/apidoc/hyperqueue.cluster.html +++ b/latest/python/apidoc/hyperqueue.cluster.html @@ -1 +1 @@ -hyperqueue.cluster

Module cluster

source link

Classes

  • class LocalCluster

    Represents a locally deployed HyperQueue infrastructure.

  • class WorkerConfig

    Configuration of a worker spawned by a local cluster.

\ No newline at end of file +hyperqueue.cluster

Module cluster

source link

Classes

  • class LocalCluster

    Represents a locally deployed HyperQueue infrastructure.

  • class WorkerConfig

    Configuration of a worker spawned by a local cluster.

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.common.html b/latest/python/apidoc/hyperqueue.common.html index 08a71a95d..d8e51a16f 100644 --- a/latest/python/apidoc/hyperqueue.common.html +++ b/latest/python/apidoc/hyperqueue.common.html @@ -1 +1 @@ -hyperqueue.common

Module common

source link
\ No newline at end of file +hyperqueue.common

Module common

source link
\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.client.ClientConnection.html b/latest/python/apidoc/hyperqueue.ffi.client.ClientConnection.html index c1fc4dc05..768c740cd 100644 --- a/latest/python/apidoc/hyperqueue.ffi.client.ClientConnection.html +++ b/latest/python/apidoc/hyperqueue.ffi.client.ClientConnection.html @@ -1 +1 @@ -hyperqueue.ffi.client.ClientConnection

Class ClientConnection

Declaration

class ClientConnection
source link

Methods

Reexports

\ No newline at end of file +hyperqueue.ffi.client.ClientConnection

Class ClientConnection

Declaration

class ClientConnection
source link

Methods

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.client.FailedTaskContext.html b/latest/python/apidoc/hyperqueue.ffi.client.FailedTaskContext.html index 70d70608e..c2f408fa6 100644 --- a/latest/python/apidoc/hyperqueue.ffi.client.FailedTaskContext.html +++ b/latest/python/apidoc/hyperqueue.ffi.client.FailedTaskContext.html @@ -1 +1 @@ -hyperqueue.ffi.client.FailedTaskContext

Class FailedTaskContext

Declaration

@dataclasses.dataclass(frozen=True)
class FailedTaskContext
source link

Reexports

\ No newline at end of file +hyperqueue.ffi.client.FailedTaskContext

Class FailedTaskContext

Declaration

@dataclasses.dataclass(frozen=True)
class FailedTaskContext
source link

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.client.HqClientContext.html b/latest/python/apidoc/hyperqueue.ffi.client.HqClientContext.html index 0e6cfc559..a90d86722 100644 --- a/latest/python/apidoc/hyperqueue.ffi.client.HqClientContext.html +++ b/latest/python/apidoc/hyperqueue.ffi.client.HqClientContext.html @@ -1 +1 @@ -hyperqueue.ffi.client.HqClientContext

Class HqClientContext

Opaque class returned from connect_to_server. Should be passed to FFI methods that require it.

Declaration

class HqClientContext
source link

Documentation

\ No newline at end of file +hyperqueue.ffi.client.HqClientContext

Class HqClientContext

Opaque class returned from connect_to_server. Should be passed to FFI methods that require it.

Declaration

class HqClientContext
source link

Documentation

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.client.html b/latest/python/apidoc/hyperqueue.ffi.client.html index 847836ed9..0452a2d1e 100644 --- a/latest/python/apidoc/hyperqueue.ffi.client.html +++ b/latest/python/apidoc/hyperqueue.ffi.client.html @@ -1 +1 @@ -hyperqueue.ffi.client

Module client

source link

Classes

\ No newline at end of file +hyperqueue.ffi.client

Module client

source link

Classes

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.cluster.Cluster.html b/latest/python/apidoc/hyperqueue.ffi.cluster.Cluster.html index cb3179145..7cf89e1f3 100644 --- a/latest/python/apidoc/hyperqueue.ffi.cluster.Cluster.html +++ b/latest/python/apidoc/hyperqueue.ffi.cluster.Cluster.html @@ -1 +1 @@ -hyperqueue.ffi.cluster.Cluster

Class Cluster

Declaration

class Cluster
source link

Methods

Reexports

\ No newline at end of file +hyperqueue.ffi.cluster.Cluster

Class Cluster

Declaration

class Cluster
source link

Methods

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.cluster.HqClusterContext.html b/latest/python/apidoc/hyperqueue.ffi.cluster.HqClusterContext.html index 1e556457e..76fbf17e8 100644 --- a/latest/python/apidoc/hyperqueue.ffi.cluster.HqClusterContext.html +++ b/latest/python/apidoc/hyperqueue.ffi.cluster.HqClusterContext.html @@ -1 +1 @@ -hyperqueue.ffi.cluster.HqClusterContext

Class HqClusterContext

Opaque class returned from cluster_start. Should be passed to FFI methods that require it.

Declaration

class HqClusterContext
source link

Documentation

\ No newline at end of file +hyperqueue.ffi.cluster.HqClusterContext

Class HqClusterContext

Opaque class returned from cluster_start. Should be passed to FFI methods that require it.

Declaration

class HqClusterContext
source link

Documentation

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.cluster.html b/latest/python/apidoc/hyperqueue.ffi.cluster.html index 6b62fe339..da14ab4d4 100644 --- a/latest/python/apidoc/hyperqueue.ffi.cluster.html +++ b/latest/python/apidoc/hyperqueue.ffi.cluster.html @@ -1 +1 @@ -hyperqueue.ffi.cluster

Module cluster

source link

Classes

  • class Cluster
  • class HqClusterContext

    Opaque class returned from cluster_start. Should be passed to FFI methods that require it.

\ No newline at end of file +hyperqueue.ffi.cluster

Module cluster

source link

Classes

  • class Cluster
  • class HqClusterContext

    Opaque class returned from cluster_start. Should be passed to FFI methods that require it.

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.html b/latest/python/apidoc/hyperqueue.ffi.html index 998cb8daa..8df74d3a5 100644 --- a/latest/python/apidoc/hyperqueue.ffi.html +++ b/latest/python/apidoc/hyperqueue.ffi.html @@ -1 +1 @@ -hyperqueue.ffi

Module ffi

source link

Functions

Submodules

\ No newline at end of file +hyperqueue.ffi

Module ffi

source link

Functions

Submodules

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.protocol.JobDescription.html b/latest/python/apidoc/hyperqueue.ffi.protocol.JobDescription.html index 540ed0646..775ad7477 100644 --- a/latest/python/apidoc/hyperqueue.ffi.protocol.JobDescription.html +++ b/latest/python/apidoc/hyperqueue.ffi.protocol.JobDescription.html @@ -1 +1 @@ -hyperqueue.ffi.protocol.JobDescription

Class JobDescription

Declaration

@dataclasses.dataclass
class JobDescription
source link

Reexports

\ No newline at end of file +hyperqueue.ffi.protocol.JobDescription

Class JobDescription

Declaration

@dataclasses.dataclass
class JobDescription
source link

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.protocol.ResourceRequest.html b/latest/python/apidoc/hyperqueue.ffi.protocol.ResourceRequest.html index 2a8c2b6ce..2db818c07 100644 --- a/latest/python/apidoc/hyperqueue.ffi.protocol.ResourceRequest.html +++ b/latest/python/apidoc/hyperqueue.ffi.protocol.ResourceRequest.html @@ -1 +1 @@ -hyperqueue.ffi.protocol.ResourceRequest

Class ResourceRequest

Declaration

class ResourceRequest
source link

Methods

  • def __init__(self, *, n_nodes=0, cpus: Union[int, float, str] = 1, ...)
    def __init__(
    self,
    *,
    n_nodes=0,
    cpus: Union[int, float, str] = 1,
    resources: Optional[Dict[str, Union[int, float, str]]] = None,
    min_time: Optional[datetime.timedelta] = None,
    )
  • def __repr__(self)

Reexports

\ No newline at end of file +hyperqueue.ffi.protocol.ResourceRequest

Class ResourceRequest

Declaration

class ResourceRequest
source link

Methods

  • def __init__(self, *, n_nodes=0, cpus: Union[int, float, str] = 1, ...)
    def __init__(
    self,
    *,
    n_nodes=0,
    cpus: Union[int, float, str] = 1,
    resources: Optional[Dict[str, Union[int, float, str]]] = None,
    min_time: Optional[datetime.timedelta] = None,
    )
  • def __repr__(self)
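
    A hedged sketch of constructing a resource request and attaching it to a task (the import path mirrors this module's location, and the command is a placeholder; Job.program also accepts a sequence of requests to define resource variants):

    from hyperqueue import Job
    from hyperqueue.ffi.protocol import ResourceRequest

    job = Job()
    request = ResourceRequest(cpus=4, resources={"gpus": 2})   # 4 CPUs plus 2 of a "gpus" resource
    job.program(["./train.sh"], resources=request)             # placeholder command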

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.protocol.TaskDescription.html b/latest/python/apidoc/hyperqueue.ffi.protocol.TaskDescription.html index d4c6e8b7b..2b4c590db 100644 --- a/latest/python/apidoc/hyperqueue.ffi.protocol.TaskDescription.html +++ b/latest/python/apidoc/hyperqueue.ffi.protocol.TaskDescription.html @@ -1 +1 @@ -hyperqueue.ffi.protocol.TaskDescription

Class TaskDescription

Declaration

@dataclasses.dataclass()
class TaskDescription
source link

Reexports

\ No newline at end of file +hyperqueue.ffi.protocol.TaskDescription

Class TaskDescription

Declaration

@dataclasses.dataclass()
class TaskDescription
source link

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.ffi.protocol.html b/latest/python/apidoc/hyperqueue.ffi.protocol.html index 957be636b..430507ef0 100644 --- a/latest/python/apidoc/hyperqueue.ffi.protocol.html +++ b/latest/python/apidoc/hyperqueue.ffi.protocol.html @@ -1 +1 @@ -hyperqueue.ffi.protocol

Module protocol

source link

Classes

\ No newline at end of file +hyperqueue.ffi.protocol

Module protocol

source link

Classes

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.html b/latest/python/apidoc/hyperqueue.html index e79dfc70b..46348f2ee 100644 --- a/latest/python/apidoc/hyperqueue.html +++ b/latest/python/apidoc/hyperqueue.html @@ -1 +1 @@ -hyperqueue

Module hyperqueue

This is the Python API of HyperQueue.

Important classes:

  • Client is used to connect to a HyperQueue server.
  • LocalCluster can be used to spawn a local HyperQueue cluster.
  • Job describes a job containing a directed acyclic graph of tasks. It can be submitted using a client.
source link

Re-exported Classes

  • class Client

    A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

    [hyperqueue.client.Client]
  • class Job

    Represents a HQ job.

    [hyperqueue.job.Job]
  • class LocalCluster

    Represents a locally deployed HyperQueue infrastructure.

    [hyperqueue.cluster.LocalCluster]

Re-exported Functions

Submodules

\ No newline at end of file +hyperqueue

Module hyperqueue

This is the Python API of HyperQueue.

Important classes:

  • Client is used to connect to a HyperQueue server.
  • LocalCluster can be used to spawn a local HyperQueue cluster.
  • Job describes a job containing a directed acyclic graph of tasks. It can be submitted using a client.
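
A minimal end-to-end sketch (it assumes a HyperQueue server with at least one worker is already running; the greeted name is a placeholder):

    from hyperqueue import Client, Job

    def hello(name):
        print(f"Hello, {name}!")

    client = Client()                         # connect to the running server
    job = Job()
    job.function(hello, args=("world",))      # a task graph with a single Python task
    client.wait_for_jobs([client.submit(job)])
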
source link

Re-exported Classes

  • class Client

    A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

    [hyperqueue.client.Client]
  • class Job

    Represents a HQ job.

    [hyperqueue.job.Job]
  • class LocalCluster

    Represents a locally deployed HyperQueue infrastructure.

    [hyperqueue.cluster.LocalCluster]

Re-exported Functions

Submodules

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.job.Job.html b/latest/python/apidoc/hyperqueue.job.Job.html index 787191301..852e954c8 100644 --- a/latest/python/apidoc/hyperqueue.job.Job.html +++ b/latest/python/apidoc/hyperqueue.job.Job.html @@ -1 +1 @@ -hyperqueue.job.Job

Class Job

Represents a HQ job.

Declaration

class Job
source link

Documentation

Methods

  • def __init__(self, default_workdir: Optional[GenericPath] = None, ...)
    def __init__(
    self,
    default_workdir: Optional[GenericPath] = None,
    max_fails: Optional[int] = 1,
    default_env: Optional[EnvType] = None,
    )

    Parameters

    • default_workdir: Optional[GenericPath]

      Default working directory for tasks.

    • max_fails: Optional[int]

      How many tasks can fail before the whole job is cancelled.

    • default_env: Optional[EnvType]

      Environment variables that will be automatically set for each task in this job.

  • def function(self, fn, *, args=(), kwargs=None, ...) -> PythonFunction

    Creates a new task that will execute the provided Python function.

    def function(
    self,
    fn,
    *,
    args=(),
    kwargs=None,
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[Stdio] = default_stdout(),
    stderr: Optional[Stdio] = default_stderr(),
    deps: Sequence[Task] = (),
    name: Optional[str] = None,
    priority: int = 0,
    resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]] = None,
    ) -> PythonFunction

    Parameters

    • args: None

      Positional arguments that will be passed to the Python function.

    • kwargs: None

      Keyword arguments that will be passed to the Python function.

    • env: Optional[EnvType]

      Environment variables passed to the executed command.

    • cwd: Optional[GenericPath]

      Working directory of the executed command.

    • stdout: Optional[Stdio]

      Path to a file that will store the standard output of the executed command.

    • stderr: Optional[Stdio]

      Path to a file that will store the standard error output of the executed command.

    • deps: Sequence[Task]

      A sequence of dependencies that have to be completed first before this task can start executing.

    • name: Optional[str]

      Name of the task.

    • priority: int

      Priority of the created task.

    • resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]]

      List of resource requests required by this task.

  • def program(self, args: ProgramArgs, *, env: Optional[EnvType] = None, ...) -> ExternalProgram

    Creates a new task that will execute the provided command.

    def program(
    self,
    args: ProgramArgs,
    *,
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[Stdio] = default_stdout(),
    stderr: Optional[Stdio] = default_stderr(),
    stdin: Optional[Union[str, bytes]] = None,
    deps: Sequence[Task] = (),
    name: Optional[str] = None,
    task_dir: bool = False,
    priority: int = 0,
    resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]] = None,
    ) -> ExternalProgram

    Parameters

    • args: ProgramArgs

      List of arguments that will be executed. The arguments have to be strings.

    • env: Optional[EnvType]

      Environment variables passed to the executed command.

    • cwd: Optional[GenericPath]

      Working directory of the executed command.

    • stdout: Optional[Stdio]

      Path to a file that will store the standard output of the executed command.

    • stderr: Optional[Stdio]

      Path to a file that will store the standard error output of the executed command.

    • stdin: Optional[Union[str, bytes]]

      If provided, these bytes will be passed as the standard input of the executed command.

    • deps: Sequence[Task]

      A sequence of dependencies that have to be completed first before this task can start executing.

    • name: Optional[str]

      Name of the task.

    • task_dir: bool

      If True, an isolated directory will be created for the task.

    • priority: int

      Priority of the created task.

    • resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]]

      List of resource requests required by this task.

  • def task_by_id(self, id: TaskId) -> Optional[Task]

    Finds a task with the given ID.

Reexports

\ No newline at end of file +hyperqueue.job.Job

Class Job

Represents a HQ job.

Declaration

class Job
source link

Documentation

Methods

  • def __init__(self, default_workdir: Optional[GenericPath] = None, ...)
    def __init__(
    self,
    default_workdir: Optional[GenericPath] = None,
    max_fails: Optional[int] = 1,
    default_env: Optional[EnvType] = None,
    )

    Parameters

    • default_workdir: Optional[GenericPath]

      Default working directory for tasks.

    • max_fails: Optional[int]

      How many tasks can fail before the whole job is cancelled.

    • default_env: Optional[EnvType]

      Environment variables that will be automatically set for each task in this job.

  • def function(self, fn, *, args=(), kwargs=None, ...) -> PythonFunction

    Creates a new task that will execute the provided Python function.

    def function(
    self,
    fn,
    *,
    args=(),
    kwargs=None,
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[Stdio] = default_stdout(),
    stderr: Optional[Stdio] = default_stderr(),
    deps: Sequence[Task] = (),
    name: Optional[str] = None,
    priority: int = 0,
    resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]] = None,
    ) -> PythonFunction

    Parameters

    • args: None

      Positional arguments that will be passed to the Python function.

    • kwargs: None

      Keyword arguments that will be passed to the Python function.

    • env: Optional[EnvType]

      Environment variables passed to the executed command.

    • cwd: Optional[GenericPath]

      Working directory of the executed command.

    • stdout: Optional[Stdio]

      Path to a file that will store the standard output of the executed command.

    • stderr: Optional[Stdio]

      Path to a file that will store the standard error output of the executed command.

    • deps: Sequence[Task]

      A sequence of dependencies that have to be completed first before this task can start executing.

    • name: Optional[str]

      Name of the task.

    • priority: int

      Priority of the created task.

    • resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]]

      List of resource requests required by this task.

  • def program(self, args: ProgramArgs, *, env: Optional[EnvType] = None, ...) -> ExternalProgram

    Creates a new task that will execute the provided command.

    def program(
    self,
    args: ProgramArgs,
    *,
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[Stdio] = default_stdout(),
    stderr: Optional[Stdio] = default_stderr(),
    stdin: Optional[Union[str, bytes]] = None,
    deps: Sequence[Task] = (),
    name: Optional[str] = None,
    task_dir: bool = False,
    priority: int = 0,
    resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]] = None,
    ) -> ExternalProgram

    Parameters

    • args: ProgramArgs

      List of arguments that will be executed. The arguments have to be strings.

    • env: Optional[EnvType]

      Environment variables passed to the executed command.

    • cwd: Optional[GenericPath]

      Working directory of the executed command.

    • stdout: Optional[Stdio]

      Path to a file that will store the standard output of the executed command.

    • stderr: Optional[Stdio]

      Path to a file that will store the standard error output of the executed command.

    • stdin: Optional[Union[str, bytes]]

      If provided, these bytes will be passed as the standard input of the executed command.

    • deps: Sequence[Task]

      A sequence of dependencies that have to be completed first before this task can start executing.

    • name: Optional[str]

      Name of the task.

    • task_dir: bool

      If True, an isolated directory will be created for the task.

    • priority: int

      Priority of the created task.

    • resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]]

      List of resource requests required by this task.

  • def task_by_id(self, id: TaskId) -> Optional[Task]

    Finds a task with the given ID.
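
    A hedged sketch of building a small task graph with this class (the function body, script name, and file names are placeholders; the job is submitted through hyperqueue.client.Client):

    from hyperqueue import Job

    def preprocess():
        print("preprocessing")

    job = Job(max_fails=1)
    prep = job.function(preprocess, name="preprocess")   # Python-function task
    job.program(
        ["./analyze.sh", "data.csv"],                    # placeholder external command
        deps=[prep],                                     # runs only after `preprocess` finishes
        env={"OMP_NUM_THREADS": "4"},
        stdout="analyze-out.txt",
    )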

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.job.SubmittedJob.html b/latest/python/apidoc/hyperqueue.job.SubmittedJob.html index d99f5fe9a..66eb24e23 100644 --- a/latest/python/apidoc/hyperqueue.job.SubmittedJob.html +++ b/latest/python/apidoc/hyperqueue.job.SubmittedJob.html @@ -1 +1 @@ -hyperqueue.job.SubmittedJob

Class SubmittedJob

Successfully submitted job.

Declaration

@dataclasses.dataclass
class SubmittedJob
source link

Documentation

Reexports

\ No newline at end of file +hyperqueue.job.SubmittedJob

Class SubmittedJob

Successfully submitted job.

Declaration

@dataclasses.dataclass
class SubmittedJob
source link

Documentation

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.job.html b/latest/python/apidoc/hyperqueue.job.html index 5dfb7a133..02cd6b564 100644 --- a/latest/python/apidoc/hyperqueue.job.html +++ b/latest/python/apidoc/hyperqueue.job.html @@ -1 +1 @@ -hyperqueue.job

Module job

source link

Classes

  • class Job

    Represents a HQ job.

  • class SubmittedJob

    Successfully submitted job.

Functions

\ No newline at end of file +hyperqueue.job

Module job

source link

Classes

  • class Job

    Represents a HQ job.

  • class SubmittedJob

    Successfully submitted job.

Functions

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.output.Output.html b/latest/python/apidoc/hyperqueue.output.Output.html index c3a80e1ee..354a9eaf0 100644 --- a/latest/python/apidoc/hyperqueue.output.Output.html +++ b/latest/python/apidoc/hyperqueue.output.Output.html @@ -1 +1 @@ -hyperqueue.output.Output

Class Output

Declaration

class Output
source link

Methods

  • def __init__(self, name: str, filepath: Optional[str] = None, extension: Optional[str] = None)

Reexports

\ No newline at end of file +hyperqueue.output.Output

Class Output

Declaration

class Output
source link

Methods

  • def __init__(self, name: str, filepath: Optional[str] = None, extension: Optional[str] = None)

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.output.StdioDef.html b/latest/python/apidoc/hyperqueue.output.StdioDef.html index 8836fd5ce..c26b8cff6 100644 --- a/latest/python/apidoc/hyperqueue.output.StdioDef.html +++ b/latest/python/apidoc/hyperqueue.output.StdioDef.html @@ -1 +1 @@ -hyperqueue.output.StdioDef

Class StdioDef

If path is None, then the default HQ path will be used.

Declaration

@dataclasses.dataclass
class StdioDef
source link

Documentation

Class methods

  • def from_path(path: GenericPath) -> StdioDef @staticmethod
    @staticmethod
    def from_path(path: GenericPath)
  • def remove_if_finished(path: Optional[GenericPath] = None) -> StdioDef @staticmethod
    @staticmethod
    def remove_if_finished(path: Optional[GenericPath] = None)

Reexports

\ No newline at end of file +hyperqueue.output.StdioDef

Class StdioDef

If path is None, then the default HQ path will be used.

Declaration

@dataclasses.dataclass
class StdioDef
source link

Documentation

Class methods

  • def from_path(path: GenericPath) -> StdioDef @staticmethod
    @staticmethod
    def from_path(path: GenericPath)
  • def remove_if_finished(path: Optional[GenericPath] = None) -> StdioDef @staticmethod
    @staticmethod
    def remove_if_finished(path: Optional[GenericPath] = None)
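
    A hedged sketch of using these class methods when configuring a task's stdio through Job.program (file names and the command are placeholders):

    from hyperqueue import Job
    from hyperqueue.output import StdioDef

    job = Job()
    job.program(
        ["./compute.sh"],                                    # placeholder command
        stdout=StdioDef.from_path("task-out.txt"),           # store stdout at this path
        stderr=StdioDef.remove_if_finished("task-err.txt"),  # remove stderr if the task finishes successfully
    )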

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.output.html b/latest/python/apidoc/hyperqueue.output.html index d9b5b66a0..27cf6aa88 100644 --- a/latest/python/apidoc/hyperqueue.output.html +++ b/latest/python/apidoc/hyperqueue.output.html @@ -1 +1 @@ -hyperqueue.output

Module output

source link

Classes

  • class Output
  • class StdioDef

    If path is None, then the default HQ path will be used.

Functions

\ No newline at end of file +hyperqueue.output

Module output

source link

Classes

  • class Output
  • class StdioDef

    If path is None, then the default HQ path will be used.

Functions

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.function.PythonEnv.html b/latest/python/apidoc/hyperqueue.task.function.PythonEnv.html index 72874ab5b..fc5ef0d0c 100644 --- a/latest/python/apidoc/hyperqueue.task.function.PythonEnv.html +++ b/latest/python/apidoc/hyperqueue.task.function.PythonEnv.html @@ -1 +1 @@ -hyperqueue.task.function.PythonEnv

Class PythonEnv

Describes an environment for spawning Python interpreters.

Declaration

class PythonEnv
source link

Documentation

Methods

  • def __init__(self, python_bin: str = "python3", prologue: Optional[str] = None, ...)

    Describes an environment for spawning Python interpreters.

    def __init__(
    self,
    python_bin: str = "python3",
    prologue: Optional[str] = None,
    shell: str = "bash",
    )

    Parameters

    • python_bin: str

      Python binary that will be executed.

    • prologue: Optional[str]

      Shell command that will be executed prior to launching the Python interpreter.

    • shell: str

      Shell used for executing prologue.

Reexports

\ No newline at end of file +hyperqueue.task.function.PythonEnv

Class PythonEnv

Describes an environment for spawning Python interpreters.

Declaration

class PythonEnv
source link

Documentation

Methods

  • def __init__(self, python_bin: str = "python3", prologue: Optional[str] = None, ...)

    Describes an environment for spawning Python interpreters.

    def __init__(
    self,
    python_bin: str = "python3",
    prologue: Optional[str] = None,
    shell: str = "bash",
    )

    Parameters

    • python_bin: str

      Python binary that will be executed.

    • prologue: Optional[str]

      Shell command that will be executed prior to launching the Python interpreter.

    • shell: str

      Shell used for executing prologue.
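
    A hedged sketch (the interpreter name and the module-loading prologue are site-specific assumptions; the environment is passed to Client so that Python function tasks are spawned with it):

    from hyperqueue import Client
    from hyperqueue.task.function import PythonEnv

    env = PythonEnv(
        python_bin="python3.11",        # assumed interpreter available on the workers
        prologue="ml Python/3.11",      # assumed site-specific module load, executed by bash
    )
    client = Client(python_env=env)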

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.function.PythonFunction.html b/latest/python/apidoc/hyperqueue.task.function.PythonFunction.html index 8fcd0ce82..2338d02ad 100644 --- a/latest/python/apidoc/hyperqueue.task.function.PythonFunction.html +++ b/latest/python/apidoc/hyperqueue.task.function.PythonFunction.html @@ -1 +1 @@ -hyperqueue.task.function.PythonFunction

Class PythonFunction

Task that represents the execution of a Python function.

Declaration

class PythonFunction(Task)
source link

Documentation

Methods

  • def __init__(self, task_id: TaskId, fn: Callable, *, args=(), kwargs=None, ...) override
    def __init__(
    self,
    task_id: TaskId,
    fn: Callable,
    *,
    args=(),
    kwargs=None,
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[GenericPath] = None,
    stderr: Optional[GenericPath] = None,
    name: Optional[str] = None,
    dependencies=(),
    priority: int = 0,
    resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]] = None,
    )

    This method overrides hyperqueue.task.task.Task.__init__.

  • def __repr__(self)

Inherited methods

Reexports

\ No newline at end of file +hyperqueue.task.function.PythonFunction

Class PythonFunction

Task that represents the execution of a Python function.

Declaration

class PythonFunction(Task)
source link

Documentation

Methods

  • def __init__(self, task_id: TaskId, fn: Callable, *, args=(), kwargs=None, ...) override
    def __init__(
    self,
    task_id: TaskId,
    fn: Callable,
    *,
    args=(),
    kwargs=None,
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[GenericPath] = None,
    stderr: Optional[GenericPath] = None,
    name: Optional[str] = None,
    dependencies=(),
    priority: int = 0,
    resources: Optional[Union[ResourceRequest, Sequence[ResourceRequest]]] = None,
    )

    This method overrides hyperqueue.task.task.Task.__init__.

  • def __repr__(self)

Inherited methods

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.function.html b/latest/python/apidoc/hyperqueue.task.function.html index 0b829ca3f..ba8acfd0f 100644 --- a/latest/python/apidoc/hyperqueue.task.function.html +++ b/latest/python/apidoc/hyperqueue.task.function.html @@ -1 +1 @@ -hyperqueue.task.function

Module function

source link

Classes

  • class PythonEnv

    Describes an environment for spawning Python interpreters.

  • class PythonFunction

    Task that represents the execution of a Python function.

Re-exported Classes

  • class CloudWrapper

    Wraps a callable so that cloudpickle is used to pickle it, caching the pickle.

    [hyperqueue.task.function.wrapper.CloudWrapper]

Functions

Submodules

\ No newline at end of file +hyperqueue.task.function

Module function

source link

Classes

  • class PythonEnv

    Describes an environment for spawning Python interpreters.

  • class PythonFunction

    Task that represents the execution of a Python function.

Re-exported Classes

  • class CloudWrapper

    Wraps a callable so that cloudpickle is used to pickle it, caching the pickle.

    [hyperqueue.task.function.wrapper.CloudWrapper]

Functions

Submodules

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.function.wrapper.CloudWrapper.html b/latest/python/apidoc/hyperqueue.task.function.wrapper.CloudWrapper.html index fe234454b..21af1153b 100644 --- a/latest/python/apidoc/hyperqueue.task.function.wrapper.CloudWrapper.html +++ b/latest/python/apidoc/hyperqueue.task.function.wrapper.CloudWrapper.html @@ -1 +1 @@ -hyperqueue.task.function.wrapper.CloudWrapper

Class CloudWrapper

Wraps a callable so that cloudpickle is used to pickle it, caching the pickle.

Declaration

class CloudWrapper
source link

Documentation

Methods

Reexports

\ No newline at end of file +hyperqueue.task.function.wrapper.CloudWrapper

Class CloudWrapper

Wraps a callable so that cloudpickle is used to pickle it, caching the pickle.

Declaration

class CloudWrapper
source link

Documentation

Methods

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.function.wrapper.html b/latest/python/apidoc/hyperqueue.task.function.wrapper.html index c54318cae..c1d273cf3 100644 --- a/latest/python/apidoc/hyperqueue.task.function.wrapper.html +++ b/latest/python/apidoc/hyperqueue.task.function.wrapper.html @@ -1 +1 @@ -hyperqueue.task.function.wrapper

Module wrapper

source link

Classes

  • class CloudWrapper

    Wraps a callable so that cloudpickle is used to pickle it, caching the pickle.

\ No newline at end of file +hyperqueue.task.function.wrapper

Module wrapper

source link

Classes

  • class CloudWrapper

    Wraps a callable so that cloudpickle is used to pickle it, caching the pickle.

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.html b/latest/python/apidoc/hyperqueue.task.html index 650296867..89a882c65 100644 --- a/latest/python/apidoc/hyperqueue.task.html +++ b/latest/python/apidoc/hyperqueue.task.html @@ -1 +1 @@ -hyperqueue.task

Module task

source link

Submodules

\ No newline at end of file +hyperqueue.task

Module task

source link

Submodules

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.program.ExternalProgram.html b/latest/python/apidoc/hyperqueue.task.program.ExternalProgram.html index 06e3ac8a9..6342e3965 100644 --- a/latest/python/apidoc/hyperqueue.task.program.ExternalProgram.html +++ b/latest/python/apidoc/hyperqueue.task.program.ExternalProgram.html @@ -1 +1 @@ -hyperqueue.task.program.ExternalProgram

Class ExternalProgram

Task that represents the execution of an executable binary.

Declaration

class ExternalProgram(Task)
source link

Documentation

Methods

  • def __getitem__(self, key: str)
  • def __init__(self, task_id: TaskId, *, args: List[str], env: Optional[EnvType] = None, ...) override
    def __init__(
    self,
    task_id: TaskId,
    *,
    args: List[str],
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[Stdio] = None,
    stderr: Optional[Stdio] = None,
    stdin: Optional[Union[str, bytes]] = None,
    name: Optional[str] = None,
    dependencies: Sequence[Task] = (),
    task_dir: bool = False,
    priority: int = 0,
    resources: Optional[ResourceRequest],
    )

    This method overrides hyperqueue.task.task.Task.__init__.

  • def __repr__(self)

Inherited methods

Reexports

\ No newline at end of file +hyperqueue.task.program.ExternalProgram

Class ExternalProgram

Task that represents the execution of an executable binary.

Declaration

class ExternalProgram(Task)
source link

Documentation

Methods

  • def __getitem__(self, key: str)
  • def __init__(self, task_id: TaskId, *, args: List[str], env: Optional[EnvType] = None, ...) override
    def __init__(
    self,
    task_id: TaskId,
    *,
    args: List[str],
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[Stdio] = None,
    stderr: Optional[Stdio] = None,
    stdin: Optional[Union[str, bytes]] = None,
    name: Optional[str] = None,
    dependencies: Sequence[Task] = (),
    task_dir: bool = False,
    priority: int = 0,
    resources: Optional[ResourceRequest],
    )

    This method overrides hyperqueue.task.task.Task.__init__.

  • def __repr__(self)

Inherited methods

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.program.html b/latest/python/apidoc/hyperqueue.task.program.html index 8f1e7bd4f..37695e21c 100644 --- a/latest/python/apidoc/hyperqueue.task.program.html +++ b/latest/python/apidoc/hyperqueue.task.program.html @@ -1 +1 @@ -hyperqueue.task.program

Module program

source link

Classes

  • class ExternalProgram

    Task that represents the execution of an executable binary.

Functions

\ No newline at end of file +hyperqueue.task.program

Module program

source link

Classes

  • class ExternalProgram

    Task that represents the execution of an executable binary.

Functions

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.task.Task.html b/latest/python/apidoc/hyperqueue.task.task.Task.html index 4c5e113c5..0a8b5357c 100644 --- a/latest/python/apidoc/hyperqueue.task.task.Task.html +++ b/latest/python/apidoc/hyperqueue.task.task.Task.html @@ -1 +1 @@ -hyperqueue.task.task.Task

Class Task

Declaration

class Task
source link

Methods

  • def __init__(self, task_id: TaskId, dependencies: Sequence["Task"] = (), ...)
    def __init__(
    self,
    task_id: TaskId,
    dependencies: Sequence["Task"] = (),
    priority: int = 0,
    resources: Optional[ResourceRequest] = None,
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[Stdio] = None,
    stderr: Optional[Stdio] = None,
    name: Optional[str] = None,
    )

    Overrides

    This method is overridden in:

  • def label(self) -> str @property

    Returns the label of the task. If the task has an assigned name, the label is equal to the name. Otherwise, the label is the ID of the task converted to a string.

    @property
    def label(self)

Subclasses

Reexports

\ No newline at end of file +hyperqueue.task.task.Task

Class Task

Declaration

class Task
source link

Methods

  • def __init__(self, task_id: TaskId, dependencies: Sequence["Task"] = (), ...)
    def __init__(
    self,
    task_id: TaskId,
    dependencies: Sequence["Task"] = (),
    priority: int = 0,
    resources: Optional[ResourceRequest] = None,
    env: Optional[EnvType] = None,
    cwd: Optional[GenericPath] = None,
    stdout: Optional[Stdio] = None,
    stderr: Optional[Stdio] = None,
    name: Optional[str] = None,
    )

    Overrides

    This method is overridden in:

  • def label(self) -> str @property

    Returns the label of the task. If the task has an assigned name, the label is equal to the name. Otherwise, the label is the ID of the task converted to a string.

    @property
    def label(self)

Subclasses

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.task.task.html b/latest/python/apidoc/hyperqueue.task.task.html index e635688bf..ae606c779 100644 --- a/latest/python/apidoc/hyperqueue.task.task.html +++ b/latest/python/apidoc/hyperqueue.task.task.html @@ -1 +1 @@ -hyperqueue.task.task

Module task

source link

Classes

Functions

\ No newline at end of file +hyperqueue.task.task

Module task

source link

Classes

Functions

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.utils.html b/latest/python/apidoc/hyperqueue.utils.html index 460ea4505..89777cccd 100644 --- a/latest/python/apidoc/hyperqueue.utils.html +++ b/latest/python/apidoc/hyperqueue.utils.html @@ -1 +1 @@ -hyperqueue.utils

Module utils

source link

Submodules

\ No newline at end of file +hyperqueue.utils

Module utils

source link

Submodules

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.utils.package.MissingPackageException.html b/latest/python/apidoc/hyperqueue.utils.package.MissingPackageException.html index a83c3d87b..622baea1a 100644 --- a/latest/python/apidoc/hyperqueue.utils.package.MissingPackageException.html +++ b/latest/python/apidoc/hyperqueue.utils.package.MissingPackageException.html @@ -1 +1 @@ -hyperqueue.utils.package.MissingPackageException

Class MissingPackageException

Declaration

class MissingPackageException(BaseException)
source link

Methods

Reexports

\ No newline at end of file +hyperqueue.utils.package.MissingPackageException

Class MissingPackageException

Declaration

class MissingPackageException(BaseException)
source link

Methods

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.utils.package.html b/latest/python/apidoc/hyperqueue.utils.package.html index c978f2f83..ea74b16f7 100644 --- a/latest/python/apidoc/hyperqueue.utils.package.html +++ b/latest/python/apidoc/hyperqueue.utils.package.html @@ -1 +1 @@ -hyperqueue.utils.package

Module package

source link

Classes

\ No newline at end of file +hyperqueue.utils.package

Module package

source link

Classes

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.utils.string.html b/latest/python/apidoc/hyperqueue.utils.string.html index e315763e6..5e0febb74 100644 --- a/latest/python/apidoc/hyperqueue.utils.string.html +++ b/latest/python/apidoc/hyperqueue.utils.string.html @@ -1 +1 @@ -hyperqueue.utils.string

Module string

source link

Functions

\ No newline at end of file +hyperqueue.utils.string

Module string

source link

Functions

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.validation.ValidationException.html b/latest/python/apidoc/hyperqueue.validation.ValidationException.html index 4ce48c13b..bd7e65fea 100644 --- a/latest/python/apidoc/hyperqueue.validation.ValidationException.html +++ b/latest/python/apidoc/hyperqueue.validation.ValidationException.html @@ -1 +1 @@ -hyperqueue.validation.ValidationException

Class ValidationException

Declaration

class ValidationException(BaseException)
source link

Reexports

\ No newline at end of file +hyperqueue.validation.ValidationException

Class ValidationException

Declaration

class ValidationException(BaseException)
source link

Reexports

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.validation.html b/latest/python/apidoc/hyperqueue.validation.html index 24589e5c9..db82c3e3d 100644 --- a/latest/python/apidoc/hyperqueue.validation.html +++ b/latest/python/apidoc/hyperqueue.validation.html @@ -1 +1 @@ -hyperqueue.validation

Module validation

source link

Classes

Functions

\ No newline at end of file +hyperqueue.validation

Module validation

source link

Classes

Functions

\ No newline at end of file diff --git a/latest/python/apidoc/hyperqueue.visualization.html b/latest/python/apidoc/hyperqueue.visualization.html index b42cd271e..3f3d7bd20 100644 --- a/latest/python/apidoc/hyperqueue.visualization.html +++ b/latest/python/apidoc/hyperqueue.visualization.html @@ -1 +1 @@ -hyperqueue.visualization

Module visualization

source link

Functions

  • def visualize_job(job: Job, path: GenericPath)

    Visualizes the task graph of the passed job in the DOT format. The result is written to a file located at path.

    Note: this function requires the pydot package to be installed.

\ No newline at end of file +hyperqueue.visualization

Module visualization

source link

Functions

  • def visualize_job(job: Job, path: GenericPath)

    Visualizes the task graph of the passed job in the DOT format. The result is written to a file located at path.

    Note: this function requires the pydot package to be installed.
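
    A hedged sketch (requires the pydot package; the commands and output path are placeholders):

    from hyperqueue import Job
    from hyperqueue.visualization import visualize_job

    job = Job()
    first = job.program(["echo", "first"])
    job.program(["echo", "second"], deps=[first])   # second task depends on the first

    visualize_job(job, "job-graph.dot")             # writes the task graph in DOT format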

\ No newline at end of file diff --git a/latest/python/apidoc/index.html b/latest/python/apidoc/index.html index e79dfc70b..46348f2ee 100644 --- a/latest/python/apidoc/index.html +++ b/latest/python/apidoc/index.html @@ -1 +1 @@ -hyperqueue

Module hyperqueue

This is the Python API of HyperQueue.

Important classes:

  • Client is used to connect to a HyperQueue server.
  • LocalCluster can be used to spawn a local HyperQueue cluster.
  • Job describes a job containing a directed acyclic graph of tasks. It can be submitted using a client.
source link

Re-exported Classes

  • class Client

    A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

    [hyperqueue.client.Client]
  • class Job

    Represents a HQ job.

    [hyperqueue.job.Job]
  • class LocalCluster

    Represents a locally deployed HyperQueue infrastructure.

    [hyperqueue.cluster.LocalCluster]

Re-exported Functions

Submodules

\ No newline at end of file +hyperqueue

Module hyperqueue

This is the Python API of HyperQueue.

Important classes:

  • Client is used to connect to a HyperQueue server.
  • LocalCluster can be used to spawn a local HyperQueue cluster.
  • Job describes a job containing a directed acyclic graph of tasks. It can be submitted using a client.
source link

Re-exported Classes

  • class Client

    A client serves as a gateway for submitting jobs and querying information about a running HyperQueue server.

    [hyperqueue.client.Client]
  • class Job

    Represents a HQ job.

    [hyperqueue.job.Job]
  • class LocalCluster

    Represents a locally deployed HyperQueue infrastructure.

    [hyperqueue.cluster.LocalCluster]

Re-exported Functions

Submodules

\ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.__init__.py.html b/latest/python/apidoc/source+hyperqueue.__init__.py.html index f0b83aba8..7349cab83 100644 --- a/latest/python/apidoc/source+hyperqueue.__init__.py.html +++ b/latest/python/apidoc/source+hyperqueue.__init__.py.html @@ -31,4 +31,4 @@ from .cluster import LocalCluster # noqa: F401 from .ffi import get_version # noqa: F401 from .job import Job # noqa: F401 - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.client.py.html b/latest/python/apidoc/source+hyperqueue.client.py.html index a299f031c..d5f133601 100644 --- a/latest/python/apidoc/source+hyperqueue.client.py.html +++ b/latest/python/apidoc/source+hyperqueue.client.py.html @@ -285,4 +285,4 @@ bar.update(delta) return cb - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.cluster.__init__.py.html b/latest/python/apidoc/source+hyperqueue.cluster.__init__.py.html index a5b4f3ac2..e63de0b50 100644 --- a/latest/python/apidoc/source+hyperqueue.cluster.__init__.py.html +++ b/latest/python/apidoc/source+hyperqueue.cluster.__init__.py.html @@ -143,4 +143,4 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.stop() - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.common.py.html b/latest/python/apidoc/source+hyperqueue.common.py.html index 28c0ad1ec..c83e513cf 100644 --- a/latest/python/apidoc/source+hyperqueue.common.py.html +++ b/latest/python/apidoc/source+hyperqueue.common.py.html @@ -5,4 +5,4 @@ from typing import Union GenericPath = Union[Path, str] - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.ffi.__init__.py.html b/latest/python/apidoc/source+hyperqueue.ffi.__init__.py.html index 3ec0e9522..d4c601acd 100644 --- a/latest/python/apidoc/source+hyperqueue.ffi.__init__.py.html +++ b/latest/python/apidoc/source+hyperqueue.ffi.__init__.py.html @@ -13,4 +13,4 @@ def get_version() -> str: return ffi.get_hq_version() - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.ffi.client.py.html b/latest/python/apidoc/source+hyperqueue.ffi.client.py.html index 3ec1c0a7d..8c02f5659 100644 --- a/latest/python/apidoc/source+hyperqueue.ffi.client.py.html +++ b/latest/python/apidoc/source+hyperqueue.ffi.client.py.html @@ -111,4 +111,4 @@ def forget_job(self, job_id: JobId): return ffi.forget_job(self.ctx, job_id) - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.ffi.cluster.py.html b/latest/python/apidoc/source+hyperqueue.ffi.cluster.py.html index 3555a09ab..8e410a2ae 100644 --- a/latest/python/apidoc/source+hyperqueue.ffi.cluster.py.html +++ b/latest/python/apidoc/source+hyperqueue.ffi.cluster.py.html @@ -49,4 +49,4 @@ def stop(self): return self.ctx.stop() - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.ffi.protocol.py.html b/latest/python/apidoc/source+hyperqueue.ffi.protocol.py.html index 5e13ee186..20d0c30a1 100644 --- a/latest/python/apidoc/source+hyperqueue.ffi.protocol.py.html +++ b/latest/python/apidoc/source+hyperqueue.ffi.protocol.py.html @@ -97,4 +97,4 @@ class JobDescription: tasks: List[TaskDescription] max_fails: Optional[int] - \ No newline at end of file + \ No newline at end of file diff --git 
a/latest/python/apidoc/source+hyperqueue.job.py.html b/latest/python/apidoc/source+hyperqueue.job.py.html index a251db3b3..2cb916d0c 100644 --- a/latest/python/apidoc/source+hyperqueue.job.py.html +++ b/latest/python/apidoc/source+hyperqueue.job.py.html @@ -353,4 +353,4 @@ if env: environment.update(env) return environment - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.output.py.html b/latest/python/apidoc/source+hyperqueue.output.py.html index 4bfdcd837..3142994cc 100644 --- a/latest/python/apidoc/source+hyperqueue.output.py.html +++ b/latest/python/apidoc/source+hyperqueue.output.py.html @@ -171,4 +171,4 @@ elif isinstance(collection, Output): items = [collection] return [item for item in items if isinstance(item, Output)] - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.task.function.__init__.py.html b/latest/python/apidoc/source+hyperqueue.task.function.__init__.py.html index bb906599e..d611ac042 100644 --- a/latest/python/apidoc/source+hyperqueue.task.function.__init__.py.html +++ b/latest/python/apidoc/source+hyperqueue.task.function.__init__.py.html @@ -309,4 +309,4 @@ if hasattr(fn, "__name__"): return f"{fn.__name__}/{id}" return None - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.task.function.wrapper.py.html b/latest/python/apidoc/source+hyperqueue.task.function.wrapper.py.html index 37e877cfa..0a88094fb 100644 --- a/latest/python/apidoc/source+hyperqueue.task.function.wrapper.py.html +++ b/latest/python/apidoc/source+hyperqueue.task.function.wrapper.py.html @@ -121,4 +121,4 @@ self.__class__, (None, self._get_pickled_fn(), self.cache, self.protocol), ) - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.task.program.py.html b/latest/python/apidoc/source+hyperqueue.task.program.py.html index db697836e..8ca0a4570 100644 --- a/latest/python/apidoc/source+hyperqueue.task.program.py.html +++ b/latest/python/apidoc/source+hyperqueue.task.program.py.html @@ -197,4 +197,4 @@ raise ValidationException(f"Output `{output.name}` has been defined multiple times") output_map[output.name] = output return output_map - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.task.task.py.html b/latest/python/apidoc/source+hyperqueue.task.task.py.html index 6ecb62747..4fd841a2d 100644 --- a/latest/python/apidoc/source+hyperqueue.task.task.py.html +++ b/latest/python/apidoc/source+hyperqueue.task.task.py.html @@ -139,4 +139,4 @@ raise ValidationException( f"Invalid value provided for `{stream}`: {type(stdio)}. Expected str, Path or `StdioDef` or `None`." ) - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.utils.package.py.html b/latest/python/apidoc/source+hyperqueue.utils.package.py.html index 409e78600..7ad95217c 100644 --- a/latest/python/apidoc/source+hyperqueue.utils.package.py.html +++ b/latest/python/apidoc/source+hyperqueue.utils.package.py.html @@ -9,4 +9,4 @@ def __str__(self): return f"Unable to import `{self.package}`. You have to install the `{self.package}` package." 
- \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.utils.string.py.html b/latest/python/apidoc/source+hyperqueue.utils.string.py.html index 093da67d2..d876d0f50 100644 --- a/latest/python/apidoc/source+hyperqueue.utils.string.py.html +++ b/latest/python/apidoc/source+hyperqueue.utils.string.py.html @@ -5,4 +5,4 @@ if count == 1: return text return f"{text}s" - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.validation.py.html b/latest/python/apidoc/source+hyperqueue.validation.py.html index ea97ad96e..d056238db 100644 --- a/latest/python/apidoc/source+hyperqueue.validation.py.html +++ b/latest/python/apidoc/source+hyperqueue.validation.py.html @@ -29,4 +29,4 @@ "Each program argument must either be a string or an instance of `hq.Output`. " f"Argument `{arg}` has type `{type(arg)}`." ) - \ No newline at end of file + \ No newline at end of file diff --git a/latest/python/apidoc/source+hyperqueue.visualization.py.html b/latest/python/apidoc/source+hyperqueue.visualization.py.html index f0d40b956..0da7b3a33 100644 --- a/latest/python/apidoc/source+hyperqueue.visualization.py.html +++ b/latest/python/apidoc/source+hyperqueue.visualization.py.html @@ -79,4 +79,4 @@ visit(task) graph.write(path) - \ No newline at end of file + \ No newline at end of file diff --git a/latest/search/search_index.json b/latest/search/search_index.json index a6d5cf470..350e62204 100644 --- a/latest/search/search_index.json +++ b/latest/search/search_index.json @@ -1 +1 @@ -{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"HyperQueue is a tool designed to simplify execution of large workflows (task graphs) on HPC clusters. It allows you to execute a large number of tasks in a simple way, without having to manually submit jobs into batch schedulers like Slurm or PBS. You just specify what you want to compute \u2013 HyperQueue will automatically ask for computational resources and dynamically load-balance tasks across all allocated nodes and cores. HyperQueue can also work without Slurm/PBS as a general task executor. If you use HyperQueue in your research, please consider citing it . Useful links # Installation Quick start Python API Repository Discussion forum Zulip (chat platform) Features # Resource management Batch jobs are submitted and managed automatically Computation is distributed amongst all allocated nodes and cores Tasks can specify resource requirements (# of cores, GPUs, memory, ...) Performance Scales to millions of tasks and hundreds of nodes Overhead per task is around 0.1 ms Task output can be streamed to a single file to avoid overloading distributed filesystems Simple deployment HQ is provided as a single, statically linked binary without any dependencies No admin access to a cluster is needed","title":"Overview"},{"location":"#useful-links","text":"Installation Quick start Python API Repository Discussion forum Zulip (chat platform)","title":"Useful links"},{"location":"#features","text":"Resource management Batch jobs are submitted and managed automatically Computation is distributed amongst all allocated nodes and cores Tasks can specify resource requirements (# of cores, GPUs, memory, ...) 
Performance Scales to millions of tasks and hundreds of nodes Overhead per task is around 0.1 ms Task output can be streamed to a single file to avoid overloading distributed filesystems Simple deployment HQ is provided as a single, statically linked binary without any dependencies No admin access to a cluster is needed","title":"Features"},{"location":"cheatsheet/","text":"Cheatsheet # Here you can find a cheatsheet with the most basic HQ commands.","title":"Cheatsheet"},{"location":"cheatsheet/#cheatsheet","text":"Here you can find a cheatsheet with the most basic HQ commands.","title":"Cheatsheet"},{"location":"faq/","text":"FAQ # Here you can find a list of frequently asked questions about HyperQueue. If you'd like to ask about anything related to HyperQueue, feel free to ask on our discussion forum or on our Zulip server . HQ fundamentals # How does HQ work? You start a HQ server somewhere (e.g. a login node or a cloud partition of a cluster). Then you can submit your jobs containing tasks to the server. You may have hundreds of thousands of tasks; they may have various CPUs and other resource requirements. Then you can connect any number of HQ workers to the server (either manually or via SLURM/PBS). The server will then immediately start to assign tasks to them. Workers are fully and dynamically controlled by server; you do not need to specify what tasks are executed on a particular worker or preconfigure it in any way. HQ provides a command line tool for submitting and controlling jobs. What is a task in HQ? Task is a unit of computation. Currently, it is either the execution of an arbitrary external program (specified via CLI) or the execution of a single Python function (specified via our Python API). What is a job in HQ? Job is a collection of tasks (a task graph). You can display and manage jobs using the CLI. What operating systems does HQ support? HyperQueue currently only officially supports Linux. It might be possible to compile it for other operating systems, however we do not provide any support nor promise to fix any bugs for other operating systems. How to deploy HQ? HQ is distributed as a single, self-contained and statically linked binary. It allows you to start the server, the workers, and it also serves as CLI for submitting and controlling jobs. No other services are needed. How many jobs/tasks may I submit into HQ? Our preliminary benchmarks show that the overhead of HQ is around 0.1 ms per task. It should be thus possible to submit a job with tens or hundreds of thousands tasks into HQ. Note that HQ is designed for a large number of tasks, not jobs. If you want to perform a lot of computations, use task arrays , i.e. create a job with many tasks, not many jobs each with a single task. HQ also supports streaming of task outputs into a single file. This avoids creating many small files for each task on a distributed file system, which improves scaling. Does HQ support multi-CPU tasks? Yes. You can define an arbitrary amount of CPUs for each task. HQ is also NUMA aware and you can select the NUMA allocation strategy. Does HQ support job/task arrays? Yes, see task arrays . Does HQ support tasks with dependencies? Yes, although it is currently only implemented in the Python API, which is experimental. It is currently not possible to specify dependencies using the CLI. How is HQ implemented? HQ is implemented in Rust and uses Tokio ecosystem. The scheduler is work-stealing scheduler implemented in our project Tako , that is derived from our previous work RSDS . 
Integration tests are written in Python, but HQ itself does not depend on Python. Relation to HPC technologies # Do I need to SLURM or PBS to run HQ? No. Even though HQ is designed to work smoothly on systems using SLURM/PBS, they are not required in order for HQ to work. Is HQ a replacement for SLURM or PBS? Definitely not. Multi-tenancy is out of the scope of HQ, i.e. HQ does not provide user isolation. HQ is light-weight and easy to deploy; on an HPC system each user (or a group of users that trust each other) may run their own instance of HQ. Do I need an HPC cluster to run HQ? No. None of functionality is bound to any HPC technology. Communication between all components is performed using TCP/IP. You can also run HQ locally on your personal computer. Is it safe to run HQ on a login node shared by other users? Yes. All communication is secured and encrypted. The server generates a secret file and only those users that have access to that file may submit jobs and connect workers. Users without access to the secret file will only see that the service is running. Performance should also not be a concern. Our experiments show that the server consumes only ~0.3ms of CPU time every second per a thousand tasks executed. Relation to other task runtimes # How does HQ differ from SnakeMake/Dask/Merlin/...? You can find a comparison of HQ with similar tools here .","title":"FAQ"},{"location":"faq/#faq","text":"Here you can find a list of frequently asked questions about HyperQueue. If you'd like to ask about anything related to HyperQueue, feel free to ask on our discussion forum or on our Zulip server .","title":"FAQ"},{"location":"faq/#hq-fundamentals","text":"How does HQ work? You start a HQ server somewhere (e.g. a login node or a cloud partition of a cluster). Then you can submit your jobs containing tasks to the server. You may have hundreds of thousands of tasks; they may have various CPUs and other resource requirements. Then you can connect any number of HQ workers to the server (either manually or via SLURM/PBS). The server will then immediately start to assign tasks to them. Workers are fully and dynamically controlled by server; you do not need to specify what tasks are executed on a particular worker or preconfigure it in any way. HQ provides a command line tool for submitting and controlling jobs. What is a task in HQ? Task is a unit of computation. Currently, it is either the execution of an arbitrary external program (specified via CLI) or the execution of a single Python function (specified via our Python API). What is a job in HQ? Job is a collection of tasks (a task graph). You can display and manage jobs using the CLI. What operating systems does HQ support? HyperQueue currently only officially supports Linux. It might be possible to compile it for other operating systems, however we do not provide any support nor promise to fix any bugs for other operating systems. How to deploy HQ? HQ is distributed as a single, self-contained and statically linked binary. It allows you to start the server, the workers, and it also serves as CLI for submitting and controlling jobs. No other services are needed. How many jobs/tasks may I submit into HQ? Our preliminary benchmarks show that the overhead of HQ is around 0.1 ms per task. It should be thus possible to submit a job with tens or hundreds of thousands tasks into HQ. Note that HQ is designed for a large number of tasks, not jobs. If you want to perform a lot of computations, use task arrays , i.e. 
create a job with many tasks, not many jobs each with a single task. HQ also supports streaming of task outputs into a single file. This avoids creating many small files for each task on a distributed file system, which improves scaling. Does HQ support multi-CPU tasks? Yes. You can define an arbitrary amount of CPUs for each task. HQ is also NUMA aware and you can select the NUMA allocation strategy. Does HQ support job/task arrays? Yes, see task arrays . Does HQ support tasks with dependencies? Yes, although it is currently only implemented in the Python API, which is experimental. It is currently not possible to specify dependencies using the CLI. How is HQ implemented? HQ is implemented in Rust and uses Tokio ecosystem. The scheduler is work-stealing scheduler implemented in our project Tako , that is derived from our previous work RSDS . Integration tests are written in Python, but HQ itself does not depend on Python.","title":"HQ fundamentals"},{"location":"faq/#relation-to-hpc-technologies","text":"Do I need to SLURM or PBS to run HQ? No. Even though HQ is designed to work smoothly on systems using SLURM/PBS, they are not required in order for HQ to work. Is HQ a replacement for SLURM or PBS? Definitely not. Multi-tenancy is out of the scope of HQ, i.e. HQ does not provide user isolation. HQ is light-weight and easy to deploy; on an HPC system each user (or a group of users that trust each other) may run their own instance of HQ. Do I need an HPC cluster to run HQ? No. None of functionality is bound to any HPC technology. Communication between all components is performed using TCP/IP. You can also run HQ locally on your personal computer. Is it safe to run HQ on a login node shared by other users? Yes. All communication is secured and encrypted. The server generates a secret file and only those users that have access to that file may submit jobs and connect workers. Users without access to the secret file will only see that the service is running. Performance should also not be a concern. Our experiments show that the server consumes only ~0.3ms of CPU time every second per a thousand tasks executed.","title":"Relation to HPC technologies"},{"location":"faq/#relation-to-other-task-runtimes","text":"How does HQ differ from SnakeMake/Dask/Merlin/...? You can find a comparison of HQ with similar tools here .","title":"Relation to other task runtimes"},{"location":"installation/","text":"Binary distribution (recommended) # The easiest way to install HyperQueue is to download and unpack the prebuilt hq executable: Download the latest release archive from this link . Target architecture Make sure to choose the correct binary for your architecture. Currently, we provide prebuilt binaries for x86-64 and PowerPC architectures. Unpack the downloaded archive: $ tar -xvzf hq--linux-.tar.gz The archive contains a single binary hq , which is used both for deploying the HQ cluster and submitting tasks into HQ . You can add hq to your system $PATH to make its usage easier. See Quickstart for an example \"Hello world\" HyperQueue computation. Compilation from source code # You can also compile HyperQueue from source. This allows you to build HyperQueue for architectures for which we do not provide prebuilt binaries. It can also generate binaries with support for vectorization, which could in theory improve the performance of HyperQueue in extreme cases. 
Setup a Rust toolchain Clone the HyperQueue repository: $ git clone https://github.com/It4innovations/hyperqueue/ Build HyperQueue: $ RUSTFLAGS = \"-C target-cpu=native\" cargo build --release Jemalloc dependency HyperQueue by default depends on the Jemalloc memory allocator, which is a C library. If you're having problems with installing HyperQueue because of this dependency, you can opt-out of it and use the default system allocator by building HQ with --no-default-features : $ cargo build --release --no-default-features Use the executable located in ./target/release/hq","title":"Installation"},{"location":"installation/#binary-distribution-recommended","text":"The easiest way to install HyperQueue is to download and unpack the prebuilt hq executable: Download the latest release archive from this link . Target architecture Make sure to choose the correct binary for your architecture. Currently, we provide prebuilt binaries for x86-64 and PowerPC architectures. Unpack the downloaded archive: $ tar -xvzf hq--linux-.tar.gz The archive contains a single binary hq , which is used both for deploying the HQ cluster and submitting tasks into HQ . You can add hq to your system $PATH to make its usage easier. See Quickstart for an example \"Hello world\" HyperQueue computation.","title":"Binary distribution (recommended)"},{"location":"installation/#compilation-from-source-code","text":"You can also compile HyperQueue from source. This allows you to build HyperQueue for architectures for which we do not provide prebuilt binaries. It can also generate binaries with support for vectorization, which could in theory improve the performance of HyperQueue in extreme cases. Setup a Rust toolchain Clone the HyperQueue repository: $ git clone https://github.com/It4innovations/hyperqueue/ Build HyperQueue: $ RUSTFLAGS = \"-C target-cpu=native\" cargo build --release Jemalloc dependency HyperQueue by default depends on the Jemalloc memory allocator, which is a C library. If you're having problems with installing HyperQueue because of this dependency, you can opt-out of it and use the default system allocator by building HQ with --no-default-features : $ cargo build --release --no-default-features Use the executable located in ./target/release/hq","title":"Compilation from source code"},{"location":"other-tools/","text":"Comparison with other task runtimes # There is a very large number of different task runtimes, with various performance characteristics, feature sets, programming models and trade-offs, and it is of course infeasible to compare HyperQueue with all of them. One of HyperQueue's authors has written a PhD thesis titled Ergonomics and efficiency of workflows on HPC clusters , which includes a section that compares HyperQueue with several other tools. We invite you to examine this section (and the whole thesis) if you want to find out more about the relation of HyperQueue to other task runtimes. The descriptions of other task runtimes presented on this page are actual as of October 2024. Below you can find a table 1 , which compares selected features of twelve task runtimes that we have experience with and/or that we think are relevant for HyperQueue. You can find more information about the table in Section 7.6 of the thesis. Below we also provide opinionated 2 descriptions of selected task runtimes that we think can be reasonable compared with HyperQueue. 
GNU Parallel HyperShell Dask Ray Parsl PyCOMPSs Pegasus Balsam AutoSubmit FireWorks SnakeMake Merlin GNU Parallel # GNU Parallel is a command-line utility for executing many tasks in parallel on a set of computational nodes. It does not offer many advanced task runtime features, but it does one thing well; it enables a parallelized and even distributed execution of a set of programs with a single command invocation. HyperQueue takes inspiration from this approach, as it offers a CLI that can be used to execute task graphs with many tasks and complex resource requirements with a single command. HyperShell # HyperShell is primarily designed for executing many homogeneous tasks using the command-line. It does introduce several useful features on top of GNU Parallel , such as automatic task re-execution when a task fails and storing the task state in a database, which enables users to observe the history of executed workflows. It also provides a simple autoscaling functionality that automatically submits allocations. However, tasks in HyperShell are strictly tied to allocations; by default, one task is submitted in a single allocation. It does provide the option to bundle several tasks together, but users have to specify the maximum bundle size explicitly, which makes load balancing inflexible. HyperShell does not support task dependencies; therefore, it cannot be used to execute general task graphs. Dask # Dask is a task runtime that is very popular within the Python community, which allows executing arbitrary task graphs composed of Python functions on a distributed cluster. It also supports distributing code using numpy or pandas compatible API. While Dask by itself does not interact with PBS or Slurm, you can use Dask-JobQueue to make it operate in a similar fashion as HyperQueue - with the centralized server running on a login node and the workers running on compute nodes. Dask does not support arbitrary resource requirements and since it is written in Python, it can have problems with scaling to very large task graphs. If your use-case is primarily Python-based though, you should definitely give Dask a try, it's a great tool. Ray # Ray is a distributed task runtime primarily aimed at parallelizing the training and inference of machine learning models in Python. It uses a relatively unique architecture that leverages distributed scheduling; not all task submission and scheduling decisions need to go through a central location, unlike most other compared task runtimes including HyperQueue. This allows it to scale to an enormous amount of resources, millions of tasks and thousands of nodes. However, in order to enable this level of scalability, the workflow itself has to be implemented in a way where tasks submit new tasks from worker nodes dynamically. Therefore, batch computing use-cases that simply want to execute a predetermined workflow might be unable to achieve such high performance. Same as Dask , it offers basic resource requirements and it also supports fractional resources and related resource groups. However, it does not allow expressing multi-node tasks. In contrast to Dask, it is internally implemented in C++ , which introduces much less overhead than Python. Even though Ray provides some autoscaling functionality, it does not support Slurm or other HPC allocation managers. 
In general, it is not specialized for HPC idiosyncrasies nor for executing arbitrary task graphs; even though it has a low-level interface for creating tasks through Python functions, it primarily focuses on generating task graphs automatically from high-level descriptions of machine learning pipelines, which are then executed e.g. on cloud resources. Parsl # Parsl is another representative of a Python-oriented task runtime. It allows defining tasks that represent either Python function calls or command-line application invocations using Python. Computational resources in Parsl are configured through a block , a set of preconfigured resources (nodes) designed for executing specific kinds of tasks. In addition to blocks, users also have to specify launchers , which determine how will be each task executed (e.g. using a Slurm or an MPI execution command) and also an executor , which controls how will be tasks scheduled and batched into allocations and if the execution will be fault-tolerant. While these options let users specify how will be their task graph executed on a very granular level, it requires them to tune this configuration per task graph or target cluster; the configuration system is also relatively complex. This is in contrast to HyperQueue, which has a fully general resource management model that does not require users to configure anything; tasks are automatically load balanced across all available workers regardless of allocations and workers do not have to be preconfigured for specific tasks. Parsl has basic support for resource requirements, but does not allow creating custom user-specified resource kinds. It also allows specifying the number of nodes assigned to a task; however, such tasks have to be executed within a single block; Parsl does not allow executing multi-node tasks across different blocks or allocations. PyCOMPSs # PyCOMPSs is a Python interface for executing task graphs on top of the COMPSs distributed system. It allows defining arbitrary task graphs and has comprehensive support for multi-node tasks and basic resource requirements, but it does not allow users to define custom resource requirements. It was extended to support configuration of NUMA nodes for individual tasks. In terms of scheduling, it implements several simple scheduling algorithms; users can select which one should be used. Assignment of tasks to allocations is performed in a manual way; users enqueue a task graph (an application), which is then fully executed once that allocation is started. COMPSs provides basic support for automatic allocation that can dynamically react to computational load. However, it can only add or remove nodes from a primary allocation that is always tied to the execution of a single application; it does not provide fully flexible load balancing. PyCOMPSs is slightly more challenging to deploy than most of the other compared task runtimes, since it also requires a Java runtime environment in addition to a Python interpreter. Pegasus # Pegasus is a very general workflow management system that can execute %workflows on a wide range of clusters, from HPC to cloud. It provides support for various additional features that have not been examined in this thesis, such as data provenance or advanced file management and staging. Its workflows are usually defined using workflow files, which enable specifying dependencies both explicitly or by inferring them from input/output files of tasks. 
It also supports basic resource requirements, but does not allow defining custom resource kinds nor using multi-node tasks. By default, it maps each task to a single allocation, but it also allows users to cluster tasks together using one of several predefined modes. However, users have to configure this clustering manually; it is not performed fully automatically like in HyperQueue. In terms of deployment, it has the most complex set of runtime dependencies out of the compared task runtimes, as it requires not only a Python interpreter and a Java runtime environment, but also the HTCondor workload management system, which can be non-trivial to install on an HPC cluster. Pegasus delegates some of its functionality to HTCondor; it requires a configured instance of HTCondor before it can execute workflows on a cluster. Balsam # Balsam is a task runtime for executing workflows defined using Python on HPC clusters. It uses a similar fully flexible method for mapping tasks to allocations as HyperQueue, including automatic allocation; however, it is limited to a single allocation queue, similarly as in Dask . It supports multi-node tasks, although users have to statically preconfigure workers to either execute single-node or multi-node tasks. It does not allow specifying custom resource kinds nor more advanced resource management offered by HyperQueue, such as resource variants . The Balsam server requires access to a PostgreSQL database instance, which makes its deployment slightly more challenging than some other tools that do not need a database or that can use an embedded database like SQLite. AutoSubmit # AutoSubmit is a high-level tool for executing workflows and experiments. It focuses primarily on experiment tracking, data provenance and workflow automation. In its default mode, each task corresponds to a single allocation, which is not ideal for short running tasks; AutoSubmit is designed primarily for coarse-grained workflows. It provides a way to bundle multiple tasks into the same allocation using wrappers , but same as with e.g. Pegasus , this has to be preconfigured statically by the user; it is not performed automatically. AutoSubmit does not support custom task resource kinds and it also does not support direct data transfers between tasks nor output streaming. FireWorks # FireWorks is a workflow system for managing the execution of workflows on distributed clusters. It allows defining task graphs using either workflow files or through a Python API. It supports fault-tolerant task execution, although failed tasks have to be re-executed manually. FireWorks does not seem to support any task resource requirements; resources can only be configured for individual allocations. Its meta-scheduling approach is relatively complicated; it provides several ways of mapping tasks to allocations and individual workers with different trade-offs rather than providing a unified way that users would not have to worry about. FireWorks requires a MongoDB database to store tasks, which can make its deployment slightly challenging. SnakeMake # SnakeMake is a popular workflow management system for executing coarse-grained workflows defined using workflow files that can be extended with inline Python code. It can operate both as a meta-scheduler (outside of PBS/Slurm) and also as a classical task runtime within a PBS/Slurm job. Its workflows are based on files; tasks are expected to produce and consume files, which are also used to infer dependencies between them. 
This can pose an issue with a large number of tasks, as the created files can overload distributed filesystems; no output streaming is offered by the task runtime. It enables assigning both known (e.g. CPU or memory) and custom resource kinds to tasks. It also allows specifying the number of nodes required for each task. With SnakeMake, you can submit a workflow either using a task-per-job model (which has high overhead ) or you can partition the workflow into several jobs, but in that case SnakeMake will not provide load balancing across these partitions, and partitioning the jobs manually can be quite arduous. HyperQueue allows you to submit large workflows without partitioning them manually in any way, as the server will dynamically load balance the tasks onto workers from different PBS/Slurm allocations. Since SnakeMake workflows are defined in configuration files, it's a bit more involved to run computations in SnakeMake than in HyperQueue. On the other hand, SnakeMake lets you define more complex workflows with improved traceability and reproducibility. Merlin # Merlin is a task queueing system that enables execution of large workflows on HPC clusters. It leverages the Celery task queue for distributing tasks to workers and the Maestro workflow specification for defining task graphs. Tasks are submitted into separate Celery queues, whose resources need to be preconfigured; its load balancing is thus not fully flexible and automatic like in HyperQueue. It also does not support automatic allocation and nor does it support custom resource kinds. Failed tasks can be automatically restarted if they end with a specific status code; however, if they fail because of unexpected reasons, users have to mark them for re-execution manually. Merlin requires a message broker backend, such as RabbitMQ or Redis, for its functionality, which makes its deployment non-trivial. It corresponds to Table 7.2 from the PhD thesis. \u21a9 If you think that our description is inaccurate or misleading, please file an issue . \u21a9","title":"Comparison With Other Tools"},{"location":"other-tools/#comparison-with-other-task-runtimes","text":"There is a very large number of different task runtimes, with various performance characteristics, feature sets, programming models and trade-offs, and it is of course infeasible to compare HyperQueue with all of them. One of HyperQueue's authors has written a PhD thesis titled Ergonomics and efficiency of workflows on HPC clusters , which includes a section that compares HyperQueue with several other tools. We invite you to examine this section (and the whole thesis) if you want to find out more about the relation of HyperQueue to other task runtimes. The descriptions of other task runtimes presented on this page are actual as of October 2024. Below you can find a table 1 , which compares selected features of twelve task runtimes that we have experience with and/or that we think are relevant for HyperQueue. You can find more information about the table in Section 7.6 of the thesis. Below we also provide opinionated 2 descriptions of selected task runtimes that we think can be reasonable compared with HyperQueue. GNU Parallel HyperShell Dask Ray Parsl PyCOMPSs Pegasus Balsam AutoSubmit FireWorks SnakeMake Merlin","title":"Comparison with other task runtimes"},{"location":"other-tools/#gnu-parallel","text":"GNU Parallel is a command-line utility for executing many tasks in parallel on a set of computational nodes. 
It does not offer many advanced task runtime features, but it does one thing well; it enables a parallelized and even distributed execution of a set of programs with a single command invocation. HyperQueue takes inspiration from this approach, as it offers a CLI that can be used to execute task graphs with many tasks and complex resource requirements with a single command.","title":"GNU Parallel"},{"location":"other-tools/#hypershell","text":"HyperShell is primarily designed for executing many homogeneous tasks using the command-line. It does introduce several useful features on top of GNU Parallel , such as automatic task re-execution when a task fails and storing the task state in a database, which enables users to observe the history of executed workflows. It also provides a simple autoscaling functionality that automatically submits allocations. However, tasks in HyperShell are strictly tied to allocations; by default, one task is submitted in a single allocation. It does provide the option to bundle several tasks together, but users have to specify the maximum bundle size explicitly, which makes load balancing inflexible. HyperShell does not support task dependencies; therefore, it cannot be used to execute general task graphs.","title":"HyperShell"},{"location":"other-tools/#dask","text":"Dask is a task runtime that is very popular within the Python community, which allows executing arbitrary task graphs composed of Python functions on a distributed cluster. It also supports distributing code using numpy or pandas compatible API. While Dask by itself does not interact with PBS or Slurm, you can use Dask-JobQueue to make it operate in a similar fashion as HyperQueue - with the centralized server running on a login node and the workers running on compute nodes. Dask does not support arbitrary resource requirements and since it is written in Python, it can have problems with scaling to very large task graphs. If your use-case is primarily Python-based though, you should definitely give Dask a try, it's a great tool.","title":"Dask"},{"location":"other-tools/#ray","text":"Ray is a distributed task runtime primarily aimed at parallelizing the training and inference of machine learning models in Python. It uses a relatively unique architecture that leverages distributed scheduling; not all task submission and scheduling decisions need to go through a central location, unlike most other compared task runtimes including HyperQueue. This allows it to scale to an enormous amount of resources, millions of tasks and thousands of nodes. However, in order to enable this level of scalability, the workflow itself has to be implemented in a way where tasks submit new tasks from worker nodes dynamically. Therefore, batch computing use-cases that simply want to execute a predetermined workflow might be unable to achieve such high performance. Same as Dask , it offers basic resource requirements and it also supports fractional resources and related resource groups. However, it does not allow expressing multi-node tasks. In contrast to Dask, it is internally implemented in C++ , which introduces much less overhead than Python. Even though Ray provides some autoscaling functionality, it does not support Slurm or other HPC allocation managers. 
In general, it is not specialized for HPC idiosyncrasies nor for executing arbitrary task graphs; even though it has a low-level interface for creating tasks through Python functions, it primarily focuses on generating task graphs automatically from high-level descriptions of machine learning pipelines, which are then executed e.g. on cloud resources.","title":"Ray"},{"location":"other-tools/#parsl","text":"Parsl is another representative of a Python-oriented task runtime. It allows defining tasks that represent either Python function calls or command-line application invocations using Python. Computational resources in Parsl are configured through a block , a set of preconfigured resources (nodes) designed for executing specific kinds of tasks. In addition to blocks, users also have to specify launchers , which determine how will be each task executed (e.g. using a Slurm or an MPI execution command) and also an executor , which controls how will be tasks scheduled and batched into allocations and if the execution will be fault-tolerant. While these options let users specify how will be their task graph executed on a very granular level, it requires them to tune this configuration per task graph or target cluster; the configuration system is also relatively complex. This is in contrast to HyperQueue, which has a fully general resource management model that does not require users to configure anything; tasks are automatically load balanced across all available workers regardless of allocations and workers do not have to be preconfigured for specific tasks. Parsl has basic support for resource requirements, but does not allow creating custom user-specified resource kinds. It also allows specifying the number of nodes assigned to a task; however, such tasks have to be executed within a single block; Parsl does not allow executing multi-node tasks across different blocks or allocations.","title":"Parsl"},{"location":"other-tools/#pycompss","text":"PyCOMPSs is a Python interface for executing task graphs on top of the COMPSs distributed system. It allows defining arbitrary task graphs and has comprehensive support for multi-node tasks and basic resource requirements, but it does not allow users to define custom resource requirements. It was extended to support configuration of NUMA nodes for individual tasks. In terms of scheduling, it implements several simple scheduling algorithms; users can select which one should be used. Assignment of tasks to allocations is performed in a manual way; users enqueue a task graph (an application), which is then fully executed once that allocation is started. COMPSs provides basic support for automatic allocation that can dynamically react to computational load. However, it can only add or remove nodes from a primary allocation that is always tied to the execution of a single application; it does not provide fully flexible load balancing. PyCOMPSs is slightly more challenging to deploy than most of the other compared task runtimes, since it also requires a Java runtime environment in addition to a Python interpreter.","title":"PyCOMPSs"},{"location":"other-tools/#pegasus","text":"Pegasus is a very general workflow management system that can execute %workflows on a wide range of clusters, from HPC to cloud. It provides support for various additional features that have not been examined in this thesis, such as data provenance or advanced file management and staging. 
Its workflows are usually defined using workflow files, which enable specifying dependencies both explicitly or by inferring them from input/output files of tasks. It also supports basic resource requirements, but does not allow defining custom resource kinds nor using multi-node tasks. By default, it maps each task to a single allocation, but it also allows users to cluster tasks together using one of several predefined modes. However, users have to configure this clustering manually; it is not performed fully automatically like in HyperQueue. In terms of deployment, it has the most complex set of runtime dependencies out of the compared task runtimes, as it requires not only a Python interpreter and a Java runtime environment, but also the HTCondor workload management system, which can be non-trivial to install on an HPC cluster. Pegasus delegates some of its functionality to HTCondor; it requires a configured instance of HTCondor before it can execute workflows on a cluster.","title":"Pegasus"},{"location":"other-tools/#balsam","text":"Balsam is a task runtime for executing workflows defined using Python on HPC clusters. It uses a similar fully flexible method for mapping tasks to allocations as HyperQueue, including automatic allocation; however, it is limited to a single allocation queue, similarly as in Dask . It supports multi-node tasks, although users have to statically preconfigure workers to either execute single-node or multi-node tasks. It does not allow specifying custom resource kinds nor more advanced resource management offered by HyperQueue, such as resource variants . The Balsam server requires access to a PostgreSQL database instance, which makes its deployment slightly more challenging than some other tools that do not need a database or that can use an embedded database like SQLite.","title":"Balsam"},{"location":"other-tools/#autosubmit","text":"AutoSubmit is a high-level tool for executing workflows and experiments. It focuses primarily on experiment tracking, data provenance and workflow automation. In its default mode, each task corresponds to a single allocation, which is not ideal for short running tasks; AutoSubmit is designed primarily for coarse-grained workflows. It provides a way to bundle multiple tasks into the same allocation using wrappers , but same as with e.g. Pegasus , this has to be preconfigured statically by the user; it is not performed automatically. AutoSubmit does not support custom task resource kinds and it also does not support direct data transfers between tasks nor output streaming.","title":"AutoSubmit"},{"location":"other-tools/#fireworks","text":"FireWorks is a workflow system for managing the execution of workflows on distributed clusters. It allows defining task graphs using either workflow files or through a Python API. It supports fault-tolerant task execution, although failed tasks have to be re-executed manually. FireWorks does not seem to support any task resource requirements; resources can only be configured for individual allocations. Its meta-scheduling approach is relatively complicated; it provides several ways of mapping tasks to allocations and individual workers with different trade-offs rather than providing a unified way that users would not have to worry about. 
FireWorks requires a MongoDB database to store tasks, which can make its deployment slightly challenging.","title":"FireWorks"},{"location":"other-tools/#snakemake","text":"SnakeMake is a popular workflow management system for executing coarse-grained workflows defined using workflow files that can be extended with inline Python code. It can operate both as a meta-scheduler (outside of PBS/Slurm) and also as a classical task runtime within a PBS/Slurm job. Its workflows are based on files; tasks are expected to produce and consume files, which are also used to infer dependencies between them. This can pose an issue with a large number of tasks, as the created files can overload distributed filesystems; no output streaming is offered by the task runtime. It enables assigning both known (e.g. CPU or memory) and custom resource kinds to tasks. It also allows specifying the number of nodes required for each task. With SnakeMake, you can submit a workflow either using a task-per-job model (which has high overhead ) or you can partition the workflow into several jobs, but in that case SnakeMake will not provide load balancing across these partitions, and partitioning the jobs manually can be quite arduous. HyperQueue allows you to submit large workflows without partitioning them manually in any way, as the server will dynamically load balance the tasks onto workers from different PBS/Slurm allocations. Since SnakeMake workflows are defined in configuration files, it's a bit more involved to run computations in SnakeMake than in HyperQueue. On the other hand, SnakeMake lets you define more complex workflows with improved traceability and reproducibility.","title":"SnakeMake"},{"location":"other-tools/#merlin","text":"Merlin is a task queueing system that enables execution of large workflows on HPC clusters. It leverages the Celery task queue for distributing tasks to workers and the Maestro workflow specification for defining task graphs. Tasks are submitted into separate Celery queues, whose resources need to be preconfigured; its load balancing is thus not fully flexible and automatic like in HyperQueue. It also does not support automatic allocation and nor does it support custom resource kinds. Failed tasks can be automatically restarted if they end with a specific status code; however, if they fail because of unexpected reasons, users have to mark them for re-execution manually. Merlin requires a message broker backend, such as RabbitMQ or Redis, for its functionality, which makes its deployment non-trivial. It corresponds to Table 7.2 from the PhD thesis. \u21a9 If you think that our description is inaccurate or misleading, please file an issue . \u21a9","title":"Merlin"},{"location":"quickstart/","text":"Here we provide an example of deploying HyperQueue on a local computer and running a simple \"Hello world\" script. Run each of the following three commands in separate terminals. Start the HyperQueue server $ hq server start The server will manage computing resources (workers) and distribute submitted tasks amongst them. Start a HyperQueue worker $ hq worker start The worker will connect to the server and execute submitted tasks. Submit a simple computation $ hq submit echo \"Hello world\" This command will submit a job with a single task that will execute echo \"Hello world\" on a worker. You can find the output of the task in job-1/0.stdout . That's it! 
For a more in-depth explanation of how HyperQueue works and what it can do, check the Deployment and Jobs sections.","title":"Quickstart"},{"location":"cli/dashboard/","text":"HyperQueue offers a command-line dashboard that shows information about the state of workers and jobs. It can show which jobs are currently queued or running, which tasks are running on which workers, or what is the current hardware utilization of workers. Warning The dashboard is currently in an experimental stage. Some of its features might not work properly, and important features might be missing. Please let us know if you encounter any issues with it, or if you want us to add new features to it. Dashboard disabled Note that the dashboard has been temporarily disabled in HyperQueue 0.19.0 because of internal architectural changes. We plan to re-enable it in the future. You can start the dashboard using the hq dashboard command: $ hq dashboard The dashboard will try to connect to a running HyperQueue server, and display various information. You can navigate the dashboard using your keyboard. Here is an example video that shows how does the dashboard look like:","title":"Dashboard"},{"location":"cli/output-mode/","text":"By default, HyperQueue CLI commands output information in a human-readable way, usually in the form of a table. If you want to use the CLI commands programmatically, HyperQueue offers two additional output modes that are designed to be machine-readable. You can change the output type of any HyperQueue CLI command either by using the --output-mode flag or by setting the HQ_OUTPUT_MODE environment variable. Flag Environment variable $ hq --output-mode = json job list $ HQ_OUTPUT_MODE = json hq job list Currently, there are three output modes available. The default, human-readable cli mode, and then two machine-readable modes, JSON and Quiet . Important Each machine-readable mode supports a set of commands. You can also use commands that are not listed here, but their output might be unstable, or they might not output anything for a given output mode. JSON # The json output mode is intended to provide very detailed information in the form of a JSON value. With this mode, HyperQueue will always output exactly one JSON value, either an array or an object. Error handling # When an error occurs during the execution of a command, the program will exit with exit code 1 and the program will output a JSON object with a single error key containing a human-readable description of the error. Date formatting # Time-based items are formatted in the following way: Duration - formatted as a floating point number of seconds. 
Datetime (timestamp) - formatted as a ISO8601 date in UTC Supported commands # Server info: hq server info Example { \"host\" : \"my-machine\" , \"hq_port\" : 42189 , \"pid\" : 32586 , \"server_dir\" : \"/foo/bar/.hq-server\" , \"start_date\" : \"2021-12-20T08:45:41.775753188Z\" , \"version\" : \"0.7.0\" , \"worker_port\" : 38627 } Worker list: hq worker list Example [{ \"configuration\" : { \"heartbeat_interval\" : 8.0 , \"hostname\" : \"my-machine\" , \"idle_timeout\" : null , \"listen_address\" : \"my-machine:45611\" , \"log_dir\" : \"...\" , \"resources\" : { \"cpus\" : [[ 0 , 1 , 2 , 3 ]], \"generic\" : [{ \"kind\" : \"sum\" , \"name\" : \"resource1\" , \"params\" : { \"size\" : 1000 } }] }, \"time_limit\" : null , \"work_dir\" : \"...\" }, \"ended\" : null , \"id\" : 1 }] Worker info: hq worker info Example { \"configuration\" : { \"heartbeat_interval\" : 8.0 , \"hostname\" : \"my-machine\" , \"idle_timeout\" : null , \"listen_address\" : \"my-machine:45611\" , \"log_dir\" : \"...\" , \"resources\" : { \"cpus\" : [[ 0 , 1 , 2 , 3 ]], \"generic\" : [{ \"kind\" : \"sum\" , \"name\" : \"resource1\" , \"params\" : { \"size\" : 1000 } }] }, \"time_limit\" : null , \"work_dir\" : \"...\" }, \"ended\" : null , \"id\" : 1 } Submit a job: hq submit Example { \"id\" : 1 } Job list: hq job list Example [{ \"id\" : 1 , \"name\" : \"ls\" , \"resources\" : { \"cpus\" : { \"cpus\" : 1 , \"type\" : \"compact\" }, \"generic\" : [], \"min_time\" : 0.0 }, \"task_count\" : 1 , \"task_stats\" : { \"canceled\" : 0 , \"failed\" : 0 , \"finished\" : 1 , \"running\" : 0 , \"waiting\" : 0 } }] Job info: hq job info --tasks Example { \"finished_at\" : \"2021-12-20T08:56:16.438062340Z\" , \"info\" : { \"id\" : 1 , \"name\" : \"ls\" , \"resources\" : { \"cpus\" : { \"cpus\" : 1 , \"type\" : \"compact\" }, \"generic\" : [], \"min_time\" : 0.0 }, \"task_count\" : 1 , \"task_stats\" : { \"canceled\" : 0 , \"failed\" : 0 , \"finished\" : 1 , \"running\" : 0 , \"waiting\" : 0 } }, \"max_fails\" : null , \"pin\" : null , \"priority\" : 0 , \"program\" : { \"args\" : [ \"ls\" ], \"cwd\" : \"%{SUBMIT_DIR}\" , \"env\" : { \"FOO\" : \"BAR\" }, \"stderr\" : { \"File\" : \"job-%{JOB_ID}/%{TASK_ID}.stderr\" }, \"stdout\" : { \"File\" : \"job-%{JOB_ID}/%{TASK_ID}.stdout\" } }, \"started_at\" : \"2021-12-20T08:45:53.458919345Z\" , \"tasks\" : [{ \"finished_at\" : \"2021-12-20T08:56:16.438062340Z\" , \"id\" : 0 , \"started_at\" : \"2021-12-20T08:56:16.437123396Z\" , \"state\" : \"finished\" , \"worker\" : 1 , \"cwd\" : \"/tmp/foo\" , \"stderr\" : { \"File\" : \"job-1/0.stderr\" }, \"stdout\" : { \"File\" : \"job-1/0.stdout\" } }], \"time_limit\" : null , \"submit_dir\" : \"/foo/bar/submit\" } Automatic allocation queue list: hq alloc list Example [{ \"additional_args\" : [], \"backlog\" : 4 , \"id\" : 1 , \"manager\" : \"PBS\" , \"max_worker_count\" : null , \"name\" : null , \"timelimit\" : 1800.0 , \"worker_cpu_args\" : null , \"worker_resource_args\" : [], \"workers_per_alloc\" : 1 }] Automatic allocation queue info: hq alloc info Example [{ \"id\" : \"pbs-1\" , \"worker_count\" : 4 , \"queue_at\" : \"2021-12-20T08:56:16.437123396Z\" , \"started_at\" : \"2021-12-20T08:58:25.538001256Z\" , \"ended_at\" : null , \"status\" : \"running\" , \"workdir\" : \"/foo/bar\" }] Automatic allocation queue events: hq alloc events Example [{ \"date\" : \"2021-12-20T08:56:16.437123396Z\" , \"event\" : \"allocation-finished\" , \"params\" : { \"id\" : \"pbs-1\" } }, { \"date\" : \"2021-12-20T08:58:16.437123396Z\" , \"event\" : 
\"status-fail\" , \"params\" : { \"error\" : \"qstat failed\" } }] Quiet # The quiet output mode will cause HyperQueue to output only the most important information that should be parseable without any complex parsing logic, e.g. using only Bash scripts. Error handling # When an error occurs during the execution of a command, the program will exit with exit code 1 and the error will be printed to the standard error output. Supported commands # Submit a job: hq submit Schema Outputs a single line containing the ID of the created job. Example $ hq --output-mode = quiet submit ls 1","title":"Output mode"},{"location":"cli/output-mode/#json","text":"The json output mode is intended to provide very detailed information in the form of a JSON value. With this mode, HyperQueue will always output exactly one JSON value, either an array or an object.","title":"JSON"},{"location":"cli/output-mode/#error-handling","text":"When an error occurs during the execution of a command, the program will exit with exit code 1 and the program will output a JSON object with a single error key containing a human-readable description of the error.","title":"Error handling"},{"location":"cli/output-mode/#date-formatting","text":"Time-based items are formatted in the following way: Duration - formatted as a floating point number of seconds. Datetime (timestamp) - formatted as a ISO8601 date in UTC","title":"Date formatting"},{"location":"cli/output-mode/#supported-commands","text":"Server info: hq server info Example { \"host\" : \"my-machine\" , \"hq_port\" : 42189 , \"pid\" : 32586 , \"server_dir\" : \"/foo/bar/.hq-server\" , \"start_date\" : \"2021-12-20T08:45:41.775753188Z\" , \"version\" : \"0.7.0\" , \"worker_port\" : 38627 } Worker list: hq worker list Example [{ \"configuration\" : { \"heartbeat_interval\" : 8.0 , \"hostname\" : \"my-machine\" , \"idle_timeout\" : null , \"listen_address\" : \"my-machine:45611\" , \"log_dir\" : \"...\" , \"resources\" : { \"cpus\" : [[ 0 , 1 , 2 , 3 ]], \"generic\" : [{ \"kind\" : \"sum\" , \"name\" : \"resource1\" , \"params\" : { \"size\" : 1000 } }] }, \"time_limit\" : null , \"work_dir\" : \"...\" }, \"ended\" : null , \"id\" : 1 }] Worker info: hq worker info Example { \"configuration\" : { \"heartbeat_interval\" : 8.0 , \"hostname\" : \"my-machine\" , \"idle_timeout\" : null , \"listen_address\" : \"my-machine:45611\" , \"log_dir\" : \"...\" , \"resources\" : { \"cpus\" : [[ 0 , 1 , 2 , 3 ]], \"generic\" : [{ \"kind\" : \"sum\" , \"name\" : \"resource1\" , \"params\" : { \"size\" : 1000 } }] }, \"time_limit\" : null , \"work_dir\" : \"...\" }, \"ended\" : null , \"id\" : 1 } Submit a job: hq submit Example { \"id\" : 1 } Job list: hq job list Example [{ \"id\" : 1 , \"name\" : \"ls\" , \"resources\" : { \"cpus\" : { \"cpus\" : 1 , \"type\" : \"compact\" }, \"generic\" : [], \"min_time\" : 0.0 }, \"task_count\" : 1 , \"task_stats\" : { \"canceled\" : 0 , \"failed\" : 0 , \"finished\" : 1 , \"running\" : 0 , \"waiting\" : 0 } }] Job info: hq job info --tasks Example { \"finished_at\" : \"2021-12-20T08:56:16.438062340Z\" , \"info\" : { \"id\" : 1 , \"name\" : \"ls\" , \"resources\" : { \"cpus\" : { \"cpus\" : 1 , \"type\" : \"compact\" }, \"generic\" : [], \"min_time\" : 0.0 }, \"task_count\" : 1 , \"task_stats\" : { \"canceled\" : 0 , \"failed\" : 0 , \"finished\" : 1 , \"running\" : 0 , \"waiting\" : 0 } }, \"max_fails\" : null , \"pin\" : null , \"priority\" : 0 , \"program\" : { \"args\" : [ \"ls\" ], \"cwd\" : \"%{SUBMIT_DIR}\" , \"env\" : { \"FOO\" : \"BAR\" }, 
\"stderr\" : { \"File\" : \"job-%{JOB_ID}/%{TASK_ID}.stderr\" }, \"stdout\" : { \"File\" : \"job-%{JOB_ID}/%{TASK_ID}.stdout\" } }, \"started_at\" : \"2021-12-20T08:45:53.458919345Z\" , \"tasks\" : [{ \"finished_at\" : \"2021-12-20T08:56:16.438062340Z\" , \"id\" : 0 , \"started_at\" : \"2021-12-20T08:56:16.437123396Z\" , \"state\" : \"finished\" , \"worker\" : 1 , \"cwd\" : \"/tmp/foo\" , \"stderr\" : { \"File\" : \"job-1/0.stderr\" }, \"stdout\" : { \"File\" : \"job-1/0.stdout\" } }], \"time_limit\" : null , \"submit_dir\" : \"/foo/bar/submit\" } Automatic allocation queue list: hq alloc list Example [{ \"additional_args\" : [], \"backlog\" : 4 , \"id\" : 1 , \"manager\" : \"PBS\" , \"max_worker_count\" : null , \"name\" : null , \"timelimit\" : 1800.0 , \"worker_cpu_args\" : null , \"worker_resource_args\" : [], \"workers_per_alloc\" : 1 }] Automatic allocation queue info: hq alloc info Example [{ \"id\" : \"pbs-1\" , \"worker_count\" : 4 , \"queue_at\" : \"2021-12-20T08:56:16.437123396Z\" , \"started_at\" : \"2021-12-20T08:58:25.538001256Z\" , \"ended_at\" : null , \"status\" : \"running\" , \"workdir\" : \"/foo/bar\" }] Automatic allocation queue events: hq alloc events Example [{ \"date\" : \"2021-12-20T08:56:16.437123396Z\" , \"event\" : \"allocation-finished\" , \"params\" : { \"id\" : \"pbs-1\" } }, { \"date\" : \"2021-12-20T08:58:16.437123396Z\" , \"event\" : \"status-fail\" , \"params\" : { \"error\" : \"qstat failed\" } }]","title":"Supported commands"},{"location":"cli/output-mode/#quiet","text":"The quiet output mode will cause HyperQueue to output only the most important information that should be parseable without any complex parsing logic, e.g. using only Bash scripts.","title":"Quiet"},{"location":"cli/output-mode/#error-handling_1","text":"When an error occurs during the execution of a command, the program will exit with exit code 1 and the error will be printed to the standard error output.","title":"Error handling"},{"location":"cli/output-mode/#supported-commands_1","text":"Submit a job: hq submit Schema Outputs a single line containing the ID of the created job. Example $ hq --output-mode = quiet submit ls 1","title":"Supported commands"},{"location":"cli/shortcuts/","text":"Various HyperQueue CLI command options let you enter some value in a specific syntactical format for convenience. Here you can find a list of such shortcuts. ID selector # When you enter (job/task/worker) IDs to various HyperQueue CLI commands, you can use the following selectors to select multiple IDs at once or to reference the most recently created ID: Single ID hq worker stop 1 - stop a worker with ID 1 hq job cancel 5 - cancel a job with ID 5 -: Inclusive range of IDs, starting at start and ending at end with step step hq submit --array=1-10 - create a task array with 10 tasks hq worker stop 1-3 - stop workers with IDs 1 , 2 and 3 hq job cancel 2-10:2 - cancel jobs with IDs 2 , 4 , 6 , 8 and 10 all All valid IDs hq worker stop all - stop all workers hq job cancel all - cancel all jobs last The most recently created ID hq worker stop last - stop most recently connected worker hq job cancel last - cancel most recently submitted job You can also combine the first two types of selectors with a comma. For example, the command $ hq worker stop 1,3,5-8 would stop workers with IDs 1 , 3 , 5 , 6 , 7 and 8 . Tip You can add underscore ( _ ) separators to any of the entered numeric values to improve readability: $ hq submit --array = 1 -1000_000 ... 
Supported commands and options # hq submit --array= hq worker stop hq job info does not support all (use hq job list instead) hq job cancel hq job wait hq job progress Duration # You can enter durations using various time suffixes, for example: 1h - one hour 3m - three minutes 14s - fourteen seconds 15days 2min 2s - fifteen days, two minutes and two seconds You can also combine these suffixed values together by separating them with a space. The full specification of allowed suffixed can be found here . Supported commands and options # hq worker start --time-limit= hq worker start --idle-timeout= hq alloc add pbs --time-limit= hq submit --time-limit= ... hq submit --time-request= ... Tip For increased compatibility with PBS and Slurm , you can also specify the --time-limit option of hq alloc add using the HH:MM:SS format.","title":"Shortcuts"},{"location":"cli/shortcuts/#id-selector","text":"When you enter (job/task/worker) IDs to various HyperQueue CLI commands, you can use the following selectors to select multiple IDs at once or to reference the most recently created ID: Single ID hq worker stop 1 - stop a worker with ID 1 hq job cancel 5 - cancel a job with ID 5 -: Inclusive range of IDs, starting at start and ending at end with step step hq submit --array=1-10 - create a task array with 10 tasks hq worker stop 1-3 - stop workers with IDs 1 , 2 and 3 hq job cancel 2-10:2 - cancel jobs with IDs 2 , 4 , 6 , 8 and 10 all All valid IDs hq worker stop all - stop all workers hq job cancel all - cancel all jobs last The most recently created ID hq worker stop last - stop most recently connected worker hq job cancel last - cancel most recently submitted job You can also combine the first two types of selectors with a comma. For example, the command $ hq worker stop 1,3,5-8 would stop workers with IDs 1 , 3 , 5 , 6 , 7 and 8 . Tip You can add underscore ( _ ) separators to any of the entered numeric values to improve readability: $ hq submit --array = 1 -1000_000 ...","title":"ID selector"},{"location":"cli/shortcuts/#supported-commands-and-options","text":"hq submit --array= hq worker stop hq job info does not support all (use hq job list instead) hq job cancel hq job wait hq job progress ","title":"Supported commands and options"},{"location":"cli/shortcuts/#duration","text":"You can enter durations using various time suffixes, for example: 1h - one hour 3m - three minutes 14s - fourteen seconds 15days 2min 2s - fifteen days, two minutes and two seconds You can also combine these suffixed values together by separating them with a space. The full specification of allowed suffixed can be found here .","title":"Duration"},{"location":"cli/shortcuts/#supported-commands-and-options_1","text":"hq worker start --time-limit= hq worker start --idle-timeout= hq alloc add pbs --time-limit= hq submit --time-limit= ... hq submit --time-request= ... Tip For increased compatibility with PBS and Slurm , you can also specify the --time-limit option of hq alloc add using the HH:MM:SS format.","title":"Supported commands and options"},{"location":"deployment/","text":"Architecture # HyperQueue has two runtime components: Server : a long-lived component which can run e.g. on a login node of a computing cluster. It handles task submitted by the user, manages and asks for HPC resources (PBS/Slurm jobs) and distributes tasks to available workers. Worker : runs on a computing node and actually executes submitted tasks. Server and the workers communicate over encrypted TCP/IP channels. 
The server may run on any machine, as long as the workers are able to connect to it over TCP/IP. Connecting in the other direction (from the server machine to the worker nodes) is not required. A common use-case is to start the server on a login of an HPC system. Learn more about deploying server and the workers .","title":"Architecture"},{"location":"deployment/#architecture","text":"HyperQueue has two runtime components: Server : a long-lived component which can run e.g. on a login node of a computing cluster. It handles task submitted by the user, manages and asks for HPC resources (PBS/Slurm jobs) and distributes tasks to available workers. Worker : runs on a computing node and actually executes submitted tasks. Server and the workers communicate over encrypted TCP/IP channels. The server may run on any machine, as long as the workers are able to connect to it over TCP/IP. Connecting in the other direction (from the server machine to the worker nodes) is not required. A common use-case is to start the server on a login of an HPC system. Learn more about deploying server and the workers .","title":"Architecture"},{"location":"deployment/allocation/","text":"Automatic allocation is one of the core features of HyperQueue. When you run HyperQueue on an HPC cluster, it allows you to autonomously ask the job manager (PBS/Slurm) for computing resources and spawn HyperQueue workers on the provided nodes. Using this mechanism, you can submit computations into HyperQueue without caring about the underlying PBS/Slurm jobs. Job terminology It is common to use the term \"job\" for jobs created by an HPC job manager, such as PBS or Slurm, which are used to perform computations on HPC clusters. However, HyperQueue also uses the term \"job\" for ensembles of tasks . To differentiate between these two, we will refer to jobs created by PBS or Slurm as allocations . We will also refer to PBS/Slurm as a job manager . Allocation queue # To enable automatic allocation, you have to create an Allocation queue . It describes a specific configuration that will be used by HyperQueue to request computing resources from the job manager on your behalf. Each allocation queue has a set of parameters . You can use them to modify the behavior of automatic allocation, but for start you can simply use the defaults. However, you will almost certainly need to specify some credentials to be able to ask for computing resources using PBS/Slurm. To create a new allocation queue, use the following command and pass any required credentials (queue/partition name, account ID, etc.) after -- . These trailing arguments will then be passed directly to qsub / sbatch : PBS Slurm $ hq alloc add pbs --time-limit 1h -- -qqprod -AAccount1 $ hq alloc add slurm --time-limit 1h -- --partition = p1 Tip Make sure that a HyperQueue server is running when you execute this command. Allocation queues are not persistent, so you have to set them up each time you (re)start the server. Warning Do not pass the number of nodes that should be allocated or the allocation walltime using these trailing arguments. These parameters are configured using other means, see below . Once the queue is created, HyperQueue will start asking for allocations in order to provide computing resources (HyperQueue workers). The exact behavior of the automatic allocation process is described below . You can create multiple allocation queues, and you can even combine PBS queues with Slurm queues. 
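As a concrete illustration, the following sketch creates one PBS-backed and one Slurm-backed allocation queue from a shell script; the queue name qprod , the account Account1 and the partition p1 are placeholders that you would replace with your own credentials:
#!/bin/bash
# Create a PBS-backed allocation queue (trailing arguments are passed to qsub)
hq alloc add pbs --time-limit 1h --backlog 2 -- -q qprod -A Account1
# Create a Slurm-backed allocation queue (trailing arguments are passed to sbatch)
hq alloc add slurm --time-limit 1h --backlog 2 -- --partition=p1
Both queues can coexist under a single HyperQueue server, as noted above.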
Warning Note that the HQ server needs to have access to qsub or sbatch binaries on the node where it is executed. If you want to submit PBS/Slurm allocations on a remote cluster, you will need to use e.g. a proxy to redirect the commands to that cluster. See this issue for more information. If you have a use-case for such remote PBS/Slurm allocation submission, please let us know , as we could try to make that easier in HyperQueue if there was enough interest in it. Parameters # In addition to arguments that are passed to qsub / sbatch , you can also use several other command line options when creating a new allocation queue: Time limit # Format 1 : --time-limit Sets the walltime of created allocations. This parameter is required , as HyperQueue must know the duration of the individual allocations. Make sure that you pass a time limit that does not exceed the limit of the PBS/Slurm queue that you intend to use, otherwise the allocation submissions will fail. You can use the dry-run command to debug this. Workers in this allocation queue will be by default created with a time limit equal to the time limit of the queue (unless overridden with Worker time limit ). Important If you specify a time request for a task, you should be aware that the time limit for the allocation queue should be larger than the time request if you want to run this task on workers created by this allocations queue, because it will always take some time before a worker is fully initialized. For example, if you set --time-request 1h when submitting a task, and --time-limit 1h when creating an allocation queue, this task will never get scheduled on workers from this queue. Backlog # Format: --backlog How many allocations should be queued (waiting to be started) in PBS/Slurm at any given time. Has to be a positive integer. Workers per allocation # Format: --workers-per-alloc How many workers should be requested in each allocation. This corresponds to the number of requested nodes, as the allocator will always create a single worker per node. Max worker count # Format: --max-worker-count Maximum number of workers that can be queued or running in the allocation queue. The total amount of workers will be usually limited by the manager (PBS/Slurm), but you can use this parameter to make the limit smaller, for example if you also want to create manager allocations outside HyperQueue. Worker resources # You can specify CPU and generic resources of workers spawned by the allocation queue. The name and syntax of these parameters is the same as when you create a worker manually : PBS Slurm $ hq alloc add pbs --time-limit 1h --cpus 4x4 --resource \"gpus/nvidia=range(1-2)\" -- -qqprod -AAccount1 $ hq alloc add slurm --time-limit 1h --cpus 4x4 --resource \"gpus/nvidia=range(1-2)\" -- --partition = p1 If you do not pass any resources, they will be detected automatically (same as it works with hq worker start ). Idle timeout # Format 1 : --idle-timeout Sets the idle timeout for workers started by the allocation queue. We suggest that you do not use a long duration for this parameter, as it can result in wasting precious allocation time. Worker start command # Format: --worker-start-cmd Specifies a shell command that will be executed on each allocated node just before a worker is started on that node. You can use it e.g. to initialize some shared environment for the node, or to load software modules. 
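For example, a worker start command can be used to load software modules on every allocated node before the worker begins accepting tasks. A minimal sketch, reusing the placeholder PBS credentials from the example above and the OpenMPI module shown in the worker deployment examples:
# Load a software module on each allocated node just before the worker starts
hq alloc add pbs --time-limit 1h \
    --worker-start-cmd "ml OpenMPI" \
    -- -q qprod -A Account1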
Worker stop command # Format: --worker-stop-cmd Specifies a shell command that will be executed on each allocated node just after the worker stops on that node. You can use it e.g. to clean up a previously initialized environment for the node. Warning The execution of this command is best-effort! It is not guaranteed that the command will always be executed. For example, PBS/Slurm can kill the allocation without giving HQ a chance to run the command. Worker time limit # Format 1 : --worker-time-limit Sets the time limit of workers spawned by the allocation queue. After the time limit expires, the worker will be stopped. By default, the worker time limit is set to the time limit of the allocation queue. But if you want, you can shorten it with this flag to make the worker exit sooner, for example to give more time for a worker stop command to execute. Note This command is not designed to stop workers early if they have nothing to do. This functionality is provided by idle timeout . Name # Format: --name Name of the allocation queue. It will be used to name allocations submitted to the job manager. Serves for debug purposes only. Behavior # The automatic allocator will submit allocations to make sure that there are is a specific number of allocations waiting to be started by the job manager. This number is called backlog and you can set it when creating the queue. For example, if backlog was set to 4 and there is currently only one allocation queued into the job manager, the allocator would queue three more allocations. The backlog serves to pre-queue allocations, because it can take some time before the job manager starts them, and also as a load balancing factor, since it will allocate as many resources as the job manager allows. Note The backlog value does not limit the number of running allocations, only the number of queued allocations. Warning Do not set the backlog to a large number to avoid overloading the job manager. When an allocation starts, a HyperQueue worker will start and connect to the HyperQueue server that queued the allocation. The worker has the idle timeout set to five minutes, therefore it will terminate if it doesn't receive any new tasks for five minutes. Stopping automatic allocation # If you want to remove an allocation queue, use the following command: $ hq alloc remove When an allocation queue is removed, all its corresponding queued and running allocations will be canceled immediately. By default, HQ will not allow you to remove an allocation queue that contains a running allocation. If you want to force its removal, use the --force flag. When the HQ server stops, it will automatically remove all allocation queues and cleanup all allocations. Debugging automatic allocation # Since the automatic allocator is a \"background\" process that interacts with an external job manager, it can be challenging to debug its behavior. To aid with this process, HyperQueue provides a \"dry-run\" command that you can use to test allocation parameters. HyperQueue also provides various sources of information that can help you find out what is going on. To mitigate the case of incorrectly entered allocation parameters, HQ will also try to submit a test allocation (do a \"dry run\") into the target HPC job manager when you add a new allocation queue. If the test allocation fails, the queue will not be created. You can avoid this behaviour by passing the --no-dry-run flag to hq alloc add . There are also additional safety limits. 
If 10 allocations in a succession fail to be submitted, or if 3 allocations that were submitted fail during runtime in a succession, the corresponding allocation queue will be automatically removed. Dry-run command # To test whether PBS/Slurm will accept the submit parameters that you provide to the auto allocator without creating an allocation queue, you can use the dry-run command. It accepts the same parameters as hq alloc add , which it will use to immediately submit an allocation and print any encountered errors. $ hq alloc dry-run pbs --timelimit 2h -- q qexp -A Project1 If the allocation was submitted successfully, it will be canceled immediately to avoid wasting resources. Finding information about allocations # Basic queue information This command will show you details about allocations created by the automatic allocator. Extended logging To get more information about what is happening inside the allocator, start the HyperQueue server with the following environment variable: $ RUST_LOG = hyperqueue::server::autoalloc = debug hq server start The log output of the server will then contain a detailed trace of allocator actions. Allocation files Each time the allocator queues an allocation into the job manager, it will write the submitted bash script, allocation ID and stdout and stderr of the allocation to disk. You can find these files inside the server directory: $ ls /hq-current/autoalloc/// stderr stdout job-id hq-submit.sh Useful autoalloc commands # Here is a list of useful commands to manage automatic allocation: Display a list of all allocation queues # $ hq alloc list Display information about an allocation queue # $ hq alloc info You can filter allocations by their state ( queued , running , finished , failed ) using the --filter option. You can use various shortcuts for the duration value. \u21a9 \u21a9 \u21a9","title":"Automatic Allocation"},{"location":"deployment/allocation/#allocation-queue","text":"To enable automatic allocation, you have to create an Allocation queue . It describes a specific configuration that will be used by HyperQueue to request computing resources from the job manager on your behalf. Each allocation queue has a set of parameters . You can use them to modify the behavior of automatic allocation, but for start you can simply use the defaults. However, you will almost certainly need to specify some credentials to be able to ask for computing resources using PBS/Slurm. To create a new allocation queue, use the following command and pass any required credentials (queue/partition name, account ID, etc.) after -- . These trailing arguments will then be passed directly to qsub / sbatch : PBS Slurm $ hq alloc add pbs --time-limit 1h -- -qqprod -AAccount1 $ hq alloc add slurm --time-limit 1h -- --partition = p1 Tip Make sure that a HyperQueue server is running when you execute this command. Allocation queues are not persistent, so you have to set them up each time you (re)start the server. Warning Do not pass the number of nodes that should be allocated or the allocation walltime using these trailing arguments. These parameters are configured using other means, see below . Once the queue is created, HyperQueue will start asking for allocations in order to provide computing resources (HyperQueue workers). The exact behavior of the automatic allocation process is described below . You can create multiple allocation queues, and you can even combine PBS queues with Slurm queues. 
Warning Note that the HQ server needs to have access to qsub or sbatch binaries on the node where it is executed. If you want to submit PBS/Slurm allocations on a remote cluster, you will need to use e.g. a proxy to redirect the commands to that cluster. See this issue for more information. If you have a use-case for such remote PBS/Slurm allocation submission, please let us know , as we could try to make that easier in HyperQueue if there was enough interest in it.","title":"Allocation queue"},{"location":"deployment/allocation/#parameters","text":"In addition to arguments that are passed to qsub / sbatch , you can also use several other command line options when creating a new allocation queue:","title":"Parameters"},{"location":"deployment/allocation/#time-limit","text":"Format 1 : --time-limit Sets the walltime of created allocations. This parameter is required , as HyperQueue must know the duration of the individual allocations. Make sure that you pass a time limit that does not exceed the limit of the PBS/Slurm queue that you intend to use, otherwise the allocation submissions will fail. You can use the dry-run command to debug this. Workers in this allocation queue will be by default created with a time limit equal to the time limit of the queue (unless overridden with Worker time limit ). Important If you specify a time request for a task, you should be aware that the time limit for the allocation queue should be larger than the time request if you want to run this task on workers created by this allocations queue, because it will always take some time before a worker is fully initialized. For example, if you set --time-request 1h when submitting a task, and --time-limit 1h when creating an allocation queue, this task will never get scheduled on workers from this queue.","title":"Time limit"},{"location":"deployment/allocation/#backlog","text":"Format: --backlog How many allocations should be queued (waiting to be started) in PBS/Slurm at any given time. Has to be a positive integer.","title":"Backlog"},{"location":"deployment/allocation/#workers-per-allocation","text":"Format: --workers-per-alloc How many workers should be requested in each allocation. This corresponds to the number of requested nodes, as the allocator will always create a single worker per node.","title":"Workers per allocation"},{"location":"deployment/allocation/#max-worker-count","text":"Format: --max-worker-count Maximum number of workers that can be queued or running in the allocation queue. The total amount of workers will be usually limited by the manager (PBS/Slurm), but you can use this parameter to make the limit smaller, for example if you also want to create manager allocations outside HyperQueue.","title":"Max worker count"},{"location":"deployment/allocation/#worker-resources","text":"You can specify CPU and generic resources of workers spawned by the allocation queue. The name and syntax of these parameters is the same as when you create a worker manually : PBS Slurm $ hq alloc add pbs --time-limit 1h --cpus 4x4 --resource \"gpus/nvidia=range(1-2)\" -- -qqprod -AAccount1 $ hq alloc add slurm --time-limit 1h --cpus 4x4 --resource \"gpus/nvidia=range(1-2)\" -- --partition = p1 If you do not pass any resources, they will be detected automatically (same as it works with hq worker start ).","title":"Worker resources"},{"location":"deployment/allocation/#idle-timeout","text":"Format 1 : --idle-timeout Sets the idle timeout for workers started by the allocation queue. 
We suggest that you do not use a long duration for this parameter, as it can result in wasting precious allocation time.","title":"Idle timeout"},{"location":"deployment/allocation/#worker-start-command","text":"Format: --worker-start-cmd Specifies a shell command that will be executed on each allocated node just before a worker is started on that node. You can use it e.g. to initialize some shared environment for the node, or to load software modules.","title":"Worker start command"},{"location":"deployment/allocation/#worker-stop-command","text":"Format: --worker-stop-cmd Specifies a shell command that will be executed on each allocated node just after the worker stops on that node. You can use it e.g. to clean up a previously initialized environment for the node. Warning The execution of this command is best-effort! It is not guaranteed that the command will always be executed. For example, PBS/Slurm can kill the allocation without giving HQ a chance to run the command.","title":"Worker stop command"},{"location":"deployment/allocation/#worker-time-limit","text":"Format 1 : --worker-time-limit Sets the time limit of workers spawned by the allocation queue. After the time limit expires, the worker will be stopped. By default, the worker time limit is set to the time limit of the allocation queue. But if you want, you can shorten it with this flag to make the worker exit sooner, for example to give more time for a worker stop command to execute. Note This command is not designed to stop workers early if they have nothing to do. This functionality is provided by idle timeout .","title":"Worker time limit"},{"location":"deployment/allocation/#name","text":"Format: --name Name of the allocation queue. It will be used to name allocations submitted to the job manager. Serves for debug purposes only.","title":"Name"},{"location":"deployment/allocation/#behavior","text":"The automatic allocator will submit allocations to make sure that there are is a specific number of allocations waiting to be started by the job manager. This number is called backlog and you can set it when creating the queue. For example, if backlog was set to 4 and there is currently only one allocation queued into the job manager, the allocator would queue three more allocations. The backlog serves to pre-queue allocations, because it can take some time before the job manager starts them, and also as a load balancing factor, since it will allocate as many resources as the job manager allows. Note The backlog value does not limit the number of running allocations, only the number of queued allocations. Warning Do not set the backlog to a large number to avoid overloading the job manager. When an allocation starts, a HyperQueue worker will start and connect to the HyperQueue server that queued the allocation. The worker has the idle timeout set to five minutes, therefore it will terminate if it doesn't receive any new tasks for five minutes.","title":"Behavior"},{"location":"deployment/allocation/#stopping-automatic-allocation","text":"If you want to remove an allocation queue, use the following command: $ hq alloc remove When an allocation queue is removed, all its corresponding queued and running allocations will be canceled immediately. By default, HQ will not allow you to remove an allocation queue that contains a running allocation. If you want to force its removal, use the --force flag. 
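For instance, assuming the queue you want to remove has ID 1 (you can look up queue IDs with hq alloc list ), a forced removal might look like this:
# Remove allocation queue 1 even if it still has running allocations;
# all of its queued and running allocations will be canceled
hq alloc remove 1 --force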
When the HQ server stops, it will automatically remove all allocation queues and cleanup all allocations.","title":"Stopping automatic allocation"},{"location":"deployment/allocation/#debugging-automatic-allocation","text":"Since the automatic allocator is a \"background\" process that interacts with an external job manager, it can be challenging to debug its behavior. To aid with this process, HyperQueue provides a \"dry-run\" command that you can use to test allocation parameters. HyperQueue also provides various sources of information that can help you find out what is going on. To mitigate the case of incorrectly entered allocation parameters, HQ will also try to submit a test allocation (do a \"dry run\") into the target HPC job manager when you add a new allocation queue. If the test allocation fails, the queue will not be created. You can avoid this behaviour by passing the --no-dry-run flag to hq alloc add . There are also additional safety limits. If 10 allocations in a succession fail to be submitted, or if 3 allocations that were submitted fail during runtime in a succession, the corresponding allocation queue will be automatically removed.","title":"Debugging automatic allocation"},{"location":"deployment/allocation/#dry-run-command","text":"To test whether PBS/Slurm will accept the submit parameters that you provide to the auto allocator without creating an allocation queue, you can use the dry-run command. It accepts the same parameters as hq alloc add , which it will use to immediately submit an allocation and print any encountered errors. $ hq alloc dry-run pbs --timelimit 2h -- q qexp -A Project1 If the allocation was submitted successfully, it will be canceled immediately to avoid wasting resources.","title":"Dry-run command"},{"location":"deployment/allocation/#finding-information-about-allocations","text":"Basic queue information This command will show you details about allocations created by the automatic allocator. Extended logging To get more information about what is happening inside the allocator, start the HyperQueue server with the following environment variable: $ RUST_LOG = hyperqueue::server::autoalloc = debug hq server start The log output of the server will then contain a detailed trace of allocator actions. Allocation files Each time the allocator queues an allocation into the job manager, it will write the submitted bash script, allocation ID and stdout and stderr of the allocation to disk. You can find these files inside the server directory: $ ls /hq-current/autoalloc/// stderr stdout job-id hq-submit.sh","title":"Finding information about allocations"},{"location":"deployment/allocation/#useful-autoalloc-commands","text":"Here is a list of useful commands to manage automatic allocation:","title":"Useful autoalloc commands"},{"location":"deployment/allocation/#display-a-list-of-all-allocation-queues","text":"$ hq alloc list","title":"Display a list of all allocation queues"},{"location":"deployment/allocation/#display-information-about-an-allocation-queue","text":"$ hq alloc info You can filter allocations by their state ( queued , running , finished , failed ) using the --filter option. You can use various shortcuts for the duration value. \u21a9 \u21a9 \u21a9","title":"Display information about an allocation queue"},{"location":"deployment/cloud/","text":"Starting HQ without shared file system # On system without shared file system, all what is needed is to distribute access file ( access.json ) to clients and workers. 
This file contains the address and port where the server is running, together with secret keys. By default, clients and workers search for access.json in $HOME/.hq-server . Generate access file in advance # In many cases, you may want to generate an access file in advance, before any server is started; moreover, you may not want to regenerate the secret keys on every server start, because that would require redistributing the access file whenever the server is restarted. To solve this, an access file can be generated in advance with the \"generate-access\" command, e.g.: $ hq server generate-access myaccess.json --client-port=6789 --worker-port=1234 This generates myaccess.json , which contains the generated keys and host information. The server can later be started with this configuration as follows: $ hq server start --access-file=myaccess.json Note that the server still generates and manages its \"own\" access.json in the server directory path. For connecting clients and workers you can use either myaccess.json or the newly generated access.json ; they are the same. Example of starting a worker from myaccess.json $ mv myaccess.json /mydirectory/access.json $ hq --server-dir=/mydirectory worker start Splitting access for client and workers # The access file contains two secret keys and two connection endpoints, one for clients and one for workers. This information can be divided into two separate files, each containing only the information needed by clients or only by workers. $ hq server generate-access full.json --client-file=client.json --worker-file=worker.json --client-port=6789 --worker-port=1234 This command creates three files: full.json , client.json , worker.json . For starting a client you can use client.json as access.json , since it does not contain the information needed by workers. For starting a worker you can use worker.json as access.json , since it does not contain the information needed by clients. For starting the server ( hq server start --access-file=... ) you have to use full.json , as it contains all the necessary information. Setting different server hostname for workers and clients # You can use the following command to configure different hostnames under which the server is visible to workers and clients. hq server generate-access full.json --worker-host= --client-host= ...","title":"Without Shared Filesystem"},{"location":"deployment/cloud/#starting-hq-without-shared-file-system","text":"On systems without a shared file system, all that is needed is to distribute the access file ( access.json ) to clients and workers. This file contains the address and port where the server is running, together with secret keys. By default, clients and workers search for access.json in $HOME/.hq-server .","title":"Starting HQ without shared file system"},{"location":"deployment/cloud/#generate-access-file-in-advance","text":"In many cases, you may want to generate an access file in advance, before any server is started; moreover, you may not want to regenerate the secret keys on every server start, because that would require redistributing the access file whenever the server is restarted. To solve this, an access file can be generated in advance with the \"generate-access\" command, e.g.: $ hq server generate-access myaccess.json --client-port=6789 --worker-port=1234 This generates myaccess.json , which contains the generated keys and host information. The server can later be started with this configuration as follows: $ hq server start --access-file=myaccess.json Note that the server still generates and manages its \"own\" access.json in the server directory path.
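A minimal sketch of this workflow (the port numbers are just examples): the access file is generated once and then reused across server restarts, so clients and workers never need to receive new keys:
# On the server machine: generate the access file once and keep reusing it
hq server generate-access myaccess.json --client-port=6789 --worker-port=1234
# Every (re)start of the server can then use the same keys and ports
hq server start --access-file=myaccess.json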
For connecting clients and workers you can use both, myaccess.json or newly generated access.json , they are same. Example of starting a worker from myaccess.json $ mv myaccess.json /mydirectory/access.json $ hq --server-dir=/mydirectory worker start","title":"Generate access file in advance"},{"location":"deployment/cloud/#splitting-access-for-client-and-workers","text":"Access file contains two secret keys and two points to connect, for clients and for workers. This information can be divided into two separate files, containing only information needed only by clients or only by workers. $ hq server generate-access full.json --client-file=client.json --worker-file=worker.json --client-port=6789 --worker-port=1234 This command creates three files: full.json , client.json , worker.json . For starting a client you can use client.json as access.json while it does not contain information for workers. For starting a worker you can use worker.json as access.json while it does not contain information for clients. For starting server ( hq server start --access-file=... ) you have to use full.json as it contains all necessary information.","title":"Splitting access for client and workers"},{"location":"deployment/cloud/#setting-different-server-hostname-for-workers-and-clients","text":"You can use the following command to configure different hostnames under which the server is visible to workers and clients. hq server generate-access full.json --worker-host= --client-host= ...","title":"Setting different server hostname for workers and clients"},{"location":"deployment/server/","text":"The server is a crucial component of HyperQueue which manages workers and jobs . Before running any computations or deploying workers, you must first start the server. Starting the server # The server can be started by running the following command: $ hq server start You can change the hostname under which the server is visible to workers with the --host option: $ hq server start --host = HOST Server directory # When the server is started, it creates a server directory where it stores information needed for submitting jobs and connecting workers . This directory is then used to select a running HyperQueue instance. By default, the server directory will be stored in $HOME/.hq-server . This location may be changed with the option --server-dir= , which is available for all HyperQueue CLI commands. You can run more instances of HyperQueue under the same Unix user, by making them use different server directories. If you use a non-default server directory, make sure to pass the same --server-dir to all HyperQueue commands that should use the selected HyperQueue server: $ hq --server-dir = foo server start & $ hq --server-dir = foo worker start Tip To avoid having to pass the --server-dir parameter to all hq commands separately, you can also pass it through the HQ_SERVER_DIR environment variable, and export it to share it for all commands in the same terminal session: $ export HQ_SERVER_DIR = bar $ hq server start & $ hq worker start & Important When you start the server, it will create a new subdirectory in the server directory, which will store the data of the current running instance. It will also create a symlink hq-current which will point to the currently active subdirectory. Using this approach, you can start a server using the same server directory multiple times without overwriting data of the previous runs. Server directory access Encryption keys are stored in the server directory. 
Whoever has access to the server directory may submit jobs, connect workers to the server and decrypt communication between HyperQueue components. By default, the directory is only accessible by the user who started the server. Keeping the server alive # The server is supposed to be a long-lived component. If you shut it down, all workers will disconnect and all computations will be stopped. Therefore, it is important to make sure that the server will stay running e.g. even after you disconnect from a cluster where the server is deployed. For example, if you SSH into a login node of an HPC cluster and then run the server like this: $ hq server start The server will quit when your SSH session ends, because it will receive a SIGHUP signal. You can use established Unix approaches to avoid this behavior, for example prepending the command with nohup or using a terminal multiplexer like tmux . Resuming stopped/crashed server # The server supports resilience, which allows it to restore its state after it is stopped or if it crashes. To enable resilience, you can tell the server to log events into a journal file, using the --journal flag: $ hq server start --journal /path/to/journal If the server is stopped or it crashes, and you use the same command to start the server (using the same journal file path), it will continue from the last point: $ hq server start --journal /path/to/journal This functionality restores the state of jobs and automatic allocation queues. However, it does not restore worker connections; in the current version, new workers have to be connected to the server after it restarts. Warning If the server crashes, the last few seconds of progress may be lost. For example, when a task is finished and the server crashes before the journal is written, then after resuming the server, the task will be not be computed after a server restart. Exporting journal events # If you'd like to programmatically analyze events that are stored in the journal file, you can export them to JSON using the following command: $ hq journal export The events will be read from the provided journal and printed to stdout encoded in JSON, one event per line (this corresponds to line-delimited JSON, i.e. NDJSON ). You can also directly stream events in real-time from the server using the following command: $ hq journal stream Warning The JSON format of the journal events and their definition is currently unstable and can change with a new HyperQueue version. Stopping server # You can stop a running server with the following command: $ hq server stop When a server is stopped, all running jobs and connected workers will be immediately stopped.","title":"Server"},{"location":"deployment/server/#starting-the-server","text":"The server can be started by running the following command: $ hq server start You can change the hostname under which the server is visible to workers with the --host option: $ hq server start --host = HOST","title":"Starting the server"},{"location":"deployment/server/#server-directory","text":"When the server is started, it creates a server directory where it stores information needed for submitting jobs and connecting workers . This directory is then used to select a running HyperQueue instance. By default, the server directory will be stored in $HOME/.hq-server . This location may be changed with the option --server-dir= , which is available for all HyperQueue CLI commands. You can run more instances of HyperQueue under the same Unix user, by making them use different server directories. 
If you use a non-default server directory, make sure to pass the same --server-dir to all HyperQueue commands that should use the selected HyperQueue server: $ hq --server-dir = foo server start & $ hq --server-dir = foo worker start Tip To avoid having to pass the --server-dir parameter to all hq commands separately, you can also pass it through the HQ_SERVER_DIR environment variable, and export it to share it for all commands in the same terminal session: $ export HQ_SERVER_DIR = bar $ hq server start & $ hq worker start & Important When you start the server, it will create a new subdirectory in the server directory, which will store the data of the current running instance. It will also create a symlink hq-current which will point to the currently active subdirectory. Using this approach, you can start a server using the same server directory multiple times without overwriting data of the previous runs. Server directory access Encryption keys are stored in the server directory. Whoever has access to the server directory may submit jobs, connect workers to the server and decrypt communication between HyperQueue components. By default, the directory is only accessible by the user who started the server.","title":"Server directory"},{"location":"deployment/server/#keeping-the-server-alive","text":"The server is supposed to be a long-lived component. If you shut it down, all workers will disconnect and all computations will be stopped. Therefore, it is important to make sure that the server will stay running e.g. even after you disconnect from a cluster where the server is deployed. For example, if you SSH into a login node of an HPC cluster and then run the server like this: $ hq server start The server will quit when your SSH session ends, because it will receive a SIGHUP signal. You can use established Unix approaches to avoid this behavior, for example prepending the command with nohup or using a terminal multiplexer like tmux .","title":"Keeping the server alive"},{"location":"deployment/server/#resuming-stoppedcrashed-server","text":"The server supports resilience, which allows it to restore its state after it is stopped or if it crashes. To enable resilience, you can tell the server to log events into a journal file, using the --journal flag: $ hq server start --journal /path/to/journal If the server is stopped or it crashes, and you use the same command to start the server (using the same journal file path), it will continue from the last point: $ hq server start --journal /path/to/journal This functionality restores the state of jobs and automatic allocation queues. However, it does not restore worker connections; in the current version, new workers have to be connected to the server after it restarts. Warning If the server crashes, the last few seconds of progress may be lost. For example, when a task is finished and the server crashes before the journal is written, then after resuming the server, the task will be not be computed after a server restart.","title":"Resuming stopped/crashed server"},{"location":"deployment/server/#exporting-journal-events","text":"If you'd like to programmatically analyze events that are stored in the journal file, you can export them to JSON using the following command: $ hq journal export The events will be read from the provided journal and printed to stdout encoded in JSON, one event per line (this corresponds to line-delimited JSON, i.e. NDJSON ). 
You can also directly stream events in real-time from the server using the following command: $ hq journal stream Warning The JSON format of the journal events and their definition is currently unstable and can change with a new HyperQueue version.","title":"Exporting journal events"},{"location":"deployment/server/#stopping-server","text":"You can stop a running server with the following command: $ hq server stop When a server is stopped, all running jobs and connected workers will be immediately stopped.","title":"Stopping server"},{"location":"deployment/worker/","text":"Workers connect to a running instance of a HyperQueue server and wait for task assignments. Once some task is assigned to them, they will compute it and notify the server of its completion. Starting workers # Workers should be started on machines that will actually execute the submitted computations, e.g. computing nodes on an HPC cluster. You can either use the automatic allocation system of HyperQueue to start workers as needed, or deploy workers manually. Automatic worker deployment (recommended) # If you are using a job manager (PBS or Slurm) on an HPC cluster, the easiest way of deploying workers is to use Automatic allocation . It is a component of HyperQueue that takes care of submitting PBS/Slurm jobs and spawning HyperQueue workers. Manual worker deployment # If you want to start a worker manually, you can use the following command: $ hq worker start Each worker will be assigned a unique ID that you can use in later commands to query information about the worker or to stop it. By default, the worker will try to connect to a server using the default server directory . If you want to connect to a different server, use the --server-dir option. Sharing the server directory When you start a worker, it will need to read the server directory to find out how to connect to the server. The directory thus has to be accesible both by the server and the worker machines. On HPC clusters, it is common that login nodes and compute nodes use a shared filesystem, so this shouldn't be a problem. However, if a shared filesystem is not available on your cluster, you can just copy the server directory from the server machine to the worker machine and access it from there. The worker machine still has to be able to initiate a TCP/IP connection to the server machine though. Deploying a worker using PBS/Slurm # If you want to manually start a worker using PBS or Slurm, simply use the corresponding submit command ( qsub or sbatch ) and run the hq worker start command inside the allocated job. If you want to start a worker on each allocated node, you can run this command on each node using e.g. mpirun . Example submission script: PBS Slurm #!/bin/bash #PBS -q # Run a worker on the main node //hq worker start --manager pbs # Run a worker on all allocated nodes ml OpenMPI pbsdsh //hq worker start --manager pbs #!/bin/bash #SBATCH --partition # Run a worker on the main node //hq worker start --manager slurm # Run a worker on all allocated nodes ml OpenMPI srun --overlap //hq worker start --manager slurm The worker will try to automatically detect that it is started under a PBS/Slurm job, but you can also explicitly pass the option --manager to tell the worker that it should expect a specific environment. Stopping workers # If you have started a worker manually, and you want to stop it, you can use the hq worker stop command 1 : $ hq worker stop Time limit # HyperQueue workers are designed to be volatile, i.e. 
it is expected that they will be stopped from time to time, because they are often started inside PBS/Slurm jobs that have a limited duration. It is very useful for the workers to know how much remaining time (\"lifetime\") do they have until they will be stopped. This duration is called the Worker time limit . When a worker is started manually inside a PBS or Slurm job, it will automatically calculate the time limit from the job's metadata. If you want to set time limit for workers started outside of PBS/Slurm jobs or if you want to override the detected settings, you can use the --time-limit= option 2 when starting the worker. When the time limit is reached, the worker is automatically terminated. The time limit of a worker affects what tasks can be scheduled to it. For example, a task submitted with --time-request 10m will not be scheduled onto a worker that only has a remaining time limit of 5 minutes. Idle timeout # When you deploy HQ workers inside a PBS or Slurm job, keeping the worker alive will drain resources from your accounting project (unless you use a free queue). If a worker has nothing to do, it might be better to terminate it sooner to avoid paying these costs for no reason. You can achieve this using Worker idle timeout . If you use it, the worker will automatically stop if it receives no task to compute for the specified duration. For example, if you set the idle duration to five minutes, the worker will stop once it hadn't received any task to compute for five minutes. You can set the idle timeout using the --idle-timeout option 2 when starting the worker. Tip Workers started automatically have the idle timeout set to five minutes. Idle timeout can also be configured globally for all workers using the --idle-timeout option when starting a server: $ hq server start --idle-timeout = This value will be then used for each worker that does not explicitly specify its own idle timeout. Worker state # Each worker can be in one of the following states: Running Worker is running and is able to process tasks Connection lost Worker lost connection to the server. Probably someone manually killed the worker or job walltime in its PBS/Slurm job was reached . Heartbeat lost Communication between server and worker was interrupted. It usually signifies a network problem or a hardware crash of the computational node. Stopped Worker was stopped . Idle timeout Worker was terminated due to Idle timeout . Lost connection to the server # The behavior of what should happen with a worker that lost its connection to the server is configured via hq worker start --on-server-lost= . You can select from two policies: stop - The worker immediately terminates and kills all currently running tasks. finish-running - The worker does not start to execute any new tasks, but it tries to finish tasks that are already running. When all such tasks finish, the worker will terminate. stop is the default policy when a worker is manually started by hq worker start . When a worker is started by the automatic allocator , then finish-running is used as the default value. Useful worker commands # Here is a list of useful worker commands: Display worker list # This command will display a list of workers that are currently connected to the server: $ hq worker list If you also want to include workers that are offline (i.e. that have crashed or disconnected in the past), pass the --all flag to the list command. 
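To tie the worker options described above together, here is a minimal sketch of starting a worker manually with an explicit time limit, idle timeout and connection-loss policy (the concrete values are only examples):
# Stop the worker after two hours, or earlier if it is idle for ten minutes;
# if the connection to the server is lost, finish the tasks that are already running
hq worker start --time-limit=2h --idle-timeout=10m --on-server-lost=finish-running
You can verify the worker's configuration afterwards with hq worker list or hq worker info .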
Display information about a specific worker # $ hq worker info Worker groups # Each worker is a member exactly of one group. Groups are used when multi-node tasks are used. See more here You can use various shortcuts to select multiple workers at once. \u21a9 You can use various shortcuts for the duration value. \u21a9 \u21a9","title":"Workers"},{"location":"deployment/worker/#starting-workers","text":"Workers should be started on machines that will actually execute the submitted computations, e.g. computing nodes on an HPC cluster. You can either use the automatic allocation system of HyperQueue to start workers as needed, or deploy workers manually.","title":"Starting workers"},{"location":"deployment/worker/#automatic-worker-deployment-recommended","text":"If you are using a job manager (PBS or Slurm) on an HPC cluster, the easiest way of deploying workers is to use Automatic allocation . It is a component of HyperQueue that takes care of submitting PBS/Slurm jobs and spawning HyperQueue workers.","title":"Automatic worker deployment (recommended)"},{"location":"deployment/worker/#manual-worker-deployment","text":"If you want to start a worker manually, you can use the following command: $ hq worker start Each worker will be assigned a unique ID that you can use in later commands to query information about the worker or to stop it. By default, the worker will try to connect to a server using the default server directory . If you want to connect to a different server, use the --server-dir option. Sharing the server directory When you start a worker, it will need to read the server directory to find out how to connect to the server. The directory thus has to be accesible both by the server and the worker machines. On HPC clusters, it is common that login nodes and compute nodes use a shared filesystem, so this shouldn't be a problem. However, if a shared filesystem is not available on your cluster, you can just copy the server directory from the server machine to the worker machine and access it from there. The worker machine still has to be able to initiate a TCP/IP connection to the server machine though.","title":"Manual worker deployment"},{"location":"deployment/worker/#deploying-a-worker-using-pbsslurm","text":"If you want to manually start a worker using PBS or Slurm, simply use the corresponding submit command ( qsub or sbatch ) and run the hq worker start command inside the allocated job. If you want to start a worker on each allocated node, you can run this command on each node using e.g. mpirun . Example submission script: PBS Slurm #!/bin/bash #PBS -q # Run a worker on the main node //hq worker start --manager pbs # Run a worker on all allocated nodes ml OpenMPI pbsdsh //hq worker start --manager pbs #!/bin/bash #SBATCH --partition # Run a worker on the main node //hq worker start --manager slurm # Run a worker on all allocated nodes ml OpenMPI srun --overlap //hq worker start --manager slurm The worker will try to automatically detect that it is started under a PBS/Slurm job, but you can also explicitly pass the option --manager to tell the worker that it should expect a specific environment.","title":"Deploying a worker using PBS/Slurm"},{"location":"deployment/worker/#stopping-workers","text":"If you have started a worker manually, and you want to stop it, you can use the hq worker stop command 1 : $ hq worker stop ","title":"Stopping workers"},{"location":"deployment/worker/#time-limit","text":"HyperQueue workers are designed to be volatile, i.e. 
it is expected that they will be stopped from time to time, because they are often started inside PBS/Slurm jobs that have a limited duration. It is very useful for the workers to know how much remaining time (\"lifetime\") do they have until they will be stopped. This duration is called the Worker time limit . When a worker is started manually inside a PBS or Slurm job, it will automatically calculate the time limit from the job's metadata. If you want to set time limit for workers started outside of PBS/Slurm jobs or if you want to override the detected settings, you can use the --time-limit= option 2 when starting the worker. When the time limit is reached, the worker is automatically terminated. The time limit of a worker affects what tasks can be scheduled to it. For example, a task submitted with --time-request 10m will not be scheduled onto a worker that only has a remaining time limit of 5 minutes.","title":"Time limit"},{"location":"deployment/worker/#idle-timeout","text":"When you deploy HQ workers inside a PBS or Slurm job, keeping the worker alive will drain resources from your accounting project (unless you use a free queue). If a worker has nothing to do, it might be better to terminate it sooner to avoid paying these costs for no reason. You can achieve this using Worker idle timeout . If you use it, the worker will automatically stop if it receives no task to compute for the specified duration. For example, if you set the idle duration to five minutes, the worker will stop once it hadn't received any task to compute for five minutes. You can set the idle timeout using the --idle-timeout option 2 when starting the worker. Tip Workers started automatically have the idle timeout set to five minutes. Idle timeout can also be configured globally for all workers using the --idle-timeout option when starting a server: $ hq server start --idle-timeout = This value will be then used for each worker that does not explicitly specify its own idle timeout.","title":"Idle timeout"},{"location":"deployment/worker/#worker-state","text":"Each worker can be in one of the following states: Running Worker is running and is able to process tasks Connection lost Worker lost connection to the server. Probably someone manually killed the worker or job walltime in its PBS/Slurm job was reached . Heartbeat lost Communication between server and worker was interrupted. It usually signifies a network problem or a hardware crash of the computational node. Stopped Worker was stopped . Idle timeout Worker was terminated due to Idle timeout .","title":"Worker state"},{"location":"deployment/worker/#lost-connection-to-the-server","text":"The behavior of what should happen with a worker that lost its connection to the server is configured via hq worker start --on-server-lost= . You can select from two policies: stop - The worker immediately terminates and kills all currently running tasks. finish-running - The worker does not start to execute any new tasks, but it tries to finish tasks that are already running. When all such tasks finish, the worker will terminate. stop is the default policy when a worker is manually started by hq worker start . 
When a worker is started by the automatic allocator , then finish-running is used as the default value.","title":"Lost connection to the server"},{"location":"deployment/worker/#useful-worker-commands","text":"Here is a list of useful worker commands:","title":"Useful worker commands"},{"location":"deployment/worker/#display-worker-list","text":"This command will display a list of workers that are currently connected to the server: $ hq worker list If you also want to include workers that are offline (i.e. that have crashed or disconnected in the past), pass the --all flag to the list command.","title":"Display worker list"},{"location":"deployment/worker/#display-information-about-a-specific-worker","text":"$ hq worker info ","title":"Display information about a specific worker"},{"location":"deployment/worker/#worker-groups","text":"Each worker is a member exactly of one group. Groups are used when multi-node tasks are used. See more here You can use various shortcuts to select multiple workers at once. \u21a9 You can use various shortcuts for the duration value. \u21a9 \u21a9","title":"Worker groups"},{"location":"examples/","text":"Examples # Here you can find several examples of how HyperQueue can be used for various use-cases, both with the command-line interface and also with the Python API. You can view these examples either in the documentation or on GitHub . Iterative computation","title":"Examples"},{"location":"examples/#examples","text":"Here you can find several examples of how HyperQueue can be used for various use-cases, both with the command-line interface and also with the Python API. You can view these examples either in the documentation or on GitHub . Iterative computation","title":"Examples"},{"location":"examples/iterative-computation/","text":"Iterative computation # It is a common use-case to perform an iterative computation, e.g. run a randomized simulation until the results are stable/accurate enough, or train a machine learning model while the loss keeps dropping. While there is currently no built-in support in HQ for iteratively submitting new tasks to an existing job, you can perform an iterative computation relatively easily with the following approach: Submit a HQ job that performs a computation Wait for the job to finish Read the output of the job and decide if computation should continue If yes, go to 1. Python API # With the Python API, we can simply write the outermost iteration loop in Python, and repeatedly submit jobs, until some end criterion has been achieved: from hyperqueue import Job , Client client = Client () while True : job = Job () job . program ([ \"my-program\" ], stdout = \"out.txt\" ) # Submit a job submitted = client . submit ( job ) # Wait for it to complete client . wait_for_jobs ([ submitted ]) # Read the output of the job with open ( \"out.txt\" ) as f : # Check some termination condition and eventually end the loop if f . read () . strip () == \"done\" : break Command-line interface # With the command-line interface, you can perform the iterative loop e.g. in Bash. #!/bin/bash while : do # Submit a job and wait for it to complete ./hq submit --wait ./compute.sh # Read the output of the job output = $( ./hq job cat last stdout ) # Decide if we should end or continue if [ \" ${ output } \" -eq 0 ] ; then break fi done","title":"Iterative computation"},{"location":"examples/iterative-computation/#iterative-computation","text":"It is a common use-case to perform an iterative computation, e.g. 
run a randomized simulation until the results are stable/accurate enough, or train a machine learning model while the loss keeps dropping. While there is currently no built-in support in HQ for iteratively submitting new tasks to an existing job, you can perform an iterative computation relatively easily with the following approach: Submit a HQ job that performs a computation Wait for the job to finish Read the output of the job and decide if computation should continue If yes, go to 1.","title":"Iterative computation"},{"location":"examples/iterative-computation/#python-api","text":"With the Python API, we can simply write the outermost iteration loop in Python, and repeatedly submit jobs, until some end criterion has been achieved: from hyperqueue import Job , Client client = Client () while True : job = Job () job . program ([ \"my-program\" ], stdout = \"out.txt\" ) # Submit a job submitted = client . submit ( job ) # Wait for it to complete client . wait_for_jobs ([ submitted ]) # Read the output of the job with open ( \"out.txt\" ) as f : # Check some termination condition and eventually end the loop if f . read () . strip () == \"done\" : break","title":"Python API"},{"location":"examples/iterative-computation/#command-line-interface","text":"With the command-line interface, you can perform the iterative loop e.g. in Bash. #!/bin/bash while : do # Submit a job and wait for it to complete ./hq submit --wait ./compute.sh # Read the output of the job output = $( ./hq job cat last stdout ) # Decide if we should end or continue if [ \" ${ output } \" -eq 0 ] ; then break fi done","title":"Command-line interface"},{"location":"jobs/arrays/","text":"It is a common use case to execute the same command for multiple input parameters, for example: Perform a simulation for each input file in a directory or for each line in a CSV file. Train many machine learning models using hyperparameter search for each model configuration. HyperQueue allows you to do this using a job that contains many tasks. We call such jobs Task arrays . You can create a task array with a single submit command and then manage all created tasks as a single group using its containing job. Note Task arrays are somewhat similar to \"job arrays\" used by PBS and Slurm. However, HQ does not use PBS/Slurm job arrays for implementing this feature. Therefore, the limits that are commonly enforced on job arrays on HPC clusters do not apply to HyperQueue task arrays. Creating task arrays # To create a task array, you must provide some source that will determine how many tasks should be created and what inputs (environment variables) should be passed to each task so that you can differentiate them. Currently, you can create a task array from a range of integers , from each line of a text file or from each item of a JSON array . You cannot combine these sources, as they are mutually exclusive. Handling many output files By default, each task in a task array will create two output files (containing stdout and stderr output). Creating large task arrays will thus generate a lot of files, which can be problematic especially on network-based shared filesystems, such as Lustre. To avoid this, you can either disable the output or use Output streaming . Integer range # The simplest way of creating a task array is to specify an integer range. A task will be started for each integer in the range. You can then differentiate between the individual tasks using task id that can be accessed through the HQ_TASK_ID environment variable . 
You can enter the range as two unsigned numbers separated by a dash 1 , where the first number should be smaller than the second one. The range is inclusive. The range is entered using the --array option: # Task array with 3 tasks, with ids 1, 2, 3 $ hq submit --array 1 -3 ... # Task array with 6 tasks, with ids 0, 2, 4, 6, 8, 10 $ hq submit --array 0 -10:2 ... Lines of a file # Another way of creating a task array is to provide a text file with multiple lines. Each line from the file will be passed to a separate task, which can access the value of the line using the environment variable HQ_ENTRY . This is useful if you want to e.g. process each file inside some directory. You can generate a text file that will contain each filepath on a separate line and then pass it to the submit command using the --each-line option: $ hq submit --each-line entries.txt ... Tip To directly use an environment variable in the submitted command, you have to make sure that it will be expanded when the command is executed, not when the command is submitted. You should also execute the command in a bash script if you want to specify it directly and not via a script file. For example, the following command is incorrect , as it will expand HQ_ENTRY during submission (probably to an empty string) and submit a command ls : $ hq submit --each-line files.txt ls $HQ_ENTRY To actually submit the command ls $HQ_ENTRY , you can e.g. wrap the command in apostrophes and run it in a shell: $ hq submit --each-line files.txt bash -c 'ls $HQ_ENTRY' JSON array # You can also specify the source using a JSON array stored inside a file. HyperQueue will then create a task for each item in the array and pass the item as a JSON string to the corresponding task using the environment variable HQ_ENTRY . Note The root JSON value stored inside the file must be an array. You can create a task array in this way using the --from-json option: $ hq submit --from-json items.json ... If items.json contained this content: [{ \"batch_size\" : 4 , \"learning_rate\" : 0.01 }, { \"batch_size\" : 8 , \"learning_rate\" : 0.001 }] then HyperQueue would create two tasks, one with HQ_ENTRY set to {\"batch_size\": 4, \"learning_rate\": 0.01} and the other with HQ_ENTRY set to {\"batch_size\": 8, \"learning_rate\": 0.001} . Combining with --each-line / --from-json with --array # Option --each-line or --from-json can be combined with option --array . In such case, only a subset of lines/json will be submitted. If --array defines an ID that exceeds the number of lines in the file (or the number of elements in JSON), then the ID is silently removed. For example: $ hq submit --each-line input.txt --array \"2, 8-10\" If input.txt has sufficiently many lines then it will create array job with four tasks. One for 3rd line of file and three tasks for 9th-11th line (note that first line has id 0). It analogously works for --from-json . The full syntax can be seen in the second selector of the ID selector shortcut . \u21a9","title":"Task Arrays"},{"location":"jobs/arrays/#creating-task-arrays","text":"To create a task array, you must provide some source that will determine how many tasks should be created and what inputs (environment variables) should be passed to each task so that you can differentiate them. Currently, you can create a task array from a range of integers , from each line of a text file or from each item of a JSON array . You cannot combine these sources, as they are mutually exclusive. 
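For illustration, a task script can read HQ_TASK_ID to pick its own input; this is only a sketch, and the input-N.txt naming scheme is an assumption of the example, not something HyperQueue prescribes:
#!/bin/bash
# Each task processes the input file that matches its task id, e.g. input-3.txt for task 3.
./my-program input-${HQ_TASK_ID}.txt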
Handling many output files By default, each task in a task array will create two output files (containing stdout and stderr output). Creating large task arrays will thus generate a lot of files, which can be problematic especially on network-based shared filesystems, such as Lustre. To avoid this, you can either disable the output or use Output streaming .","title":"Creating task arrays"},{"location":"jobs/arrays/#integer-range","text":"The simplest way of creating a task array is to specify an integer range. A task will be started for each integer in the range. You can then differentiate between the individual tasks using task id that can be accessed through the HQ_TASK_ID environment variable . You can enter the range as two unsigned numbers separated by a dash 1 , where the first number should be smaller than the second one. The range is inclusive. The range is entered using the --array option: # Task array with 3 tasks, with ids 1, 2, 3 $ hq submit --array 1 -3 ... # Task array with 6 tasks, with ids 0, 2, 4, 6, 8, 10 $ hq submit --array 0 -10:2 ...","title":"Integer range"},{"location":"jobs/arrays/#lines-of-a-file","text":"Another way of creating a task array is to provide a text file with multiple lines. Each line from the file will be passed to a separate task, which can access the value of the line using the environment variable HQ_ENTRY . This is useful if you want to e.g. process each file inside some directory. You can generate a text file that will contain each filepath on a separate line and then pass it to the submit command using the --each-line option: $ hq submit --each-line entries.txt ... Tip To directly use an environment variable in the submitted command, you have to make sure that it will be expanded when the command is executed, not when the command is submitted. You should also execute the command in a bash script if you want to specify it directly and not via a script file. For example, the following command is incorrect , as it will expand HQ_ENTRY during submission (probably to an empty string) and submit a command ls : $ hq submit --each-line files.txt ls $HQ_ENTRY To actually submit the command ls $HQ_ENTRY , you can e.g. wrap the command in apostrophes and run it in a shell: $ hq submit --each-line files.txt bash -c 'ls $HQ_ENTRY'","title":"Lines of a file"},{"location":"jobs/arrays/#json-array","text":"You can also specify the source using a JSON array stored inside a file. HyperQueue will then create a task for each item in the array and pass the item as a JSON string to the corresponding task using the environment variable HQ_ENTRY . Note The root JSON value stored inside the file must be an array. You can create a task array in this way using the --from-json option: $ hq submit --from-json items.json ... If items.json contained this content: [{ \"batch_size\" : 4 , \"learning_rate\" : 0.01 }, { \"batch_size\" : 8 , \"learning_rate\" : 0.001 }] then HyperQueue would create two tasks, one with HQ_ENTRY set to {\"batch_size\": 4, \"learning_rate\": 0.01} and the other with HQ_ENTRY set to {\"batch_size\": 8, \"learning_rate\": 0.001} .","title":"JSON array"},{"location":"jobs/arrays/#combining-with-each-line-from-json-with-array","text":"Option --each-line or --from-json can be combined with option --array . In such case, only a subset of lines/json will be submitted. If --array defines an ID that exceeds the number of lines in the file (or the number of elements in JSON), then the ID is silently removed. 
For example: $ hq submit --each-line input.txt --array \"2, 8-10\" If input.txt has sufficiently many lines then it will create array job with four tasks. One for 3rd line of file and three tasks for 9th-11th line (note that first line has id 0). It analogously works for --from-json . The full syntax can be seen in the second selector of the ID selector shortcut . \u21a9","title":"Combining with --each-line/--from-json with --array"},{"location":"jobs/cresources/","text":"CPU resource management # Note In this text, we use the term CPU for a resource that is provided by the operating system (e.g. what you get from /proc/cpuinfo ). In this meaning, it is usually a core of a physical CPU. In the text related to NUMA we use the term socket to refer to physical CPUs. Brief introduction # HyperQueue allows you to select how many CPU cores will be allocated for each task. By default, each task requires a single CPU of the worker's node. This can be changed by the flag --cpus . For example, to submit a job with a task that requires 8 CPUs: $ hq submit --cpus = 8 This ensures that HyperQueue will exclusively reserve 8 CPUs for this task when it is started. This task would thus never be scheduled on a worker that has less than 8 CPUs. Note that this reservation exists on a logical level only. To ensure more direct mapping to physical cores, see pinning below. CPUs are a resource # From version 0.13.0, CPUs are managed as any other resource under name \"cpus\", with the following additions: If a task does not explicitly specify the number of cpus, then it requests 1 CPU as default. CPUs request can be specified by hq submit --cpus=X ... where --cpus=X is a shortcut for --resource cpus=X , and X can be all valid requests for a resource, including values like all or 8 compact! . (More in Resource Management ). A task may be automatically pinned to a given CPUs (see pinning ). There are some extra environmental variables for CPUs (see below). CPUs are automatically detected. See below for information about NUMA or Hyper Threading. CPUs provided by a worker can be explicitly specified via --cpus , see below. CPU related environment variables # The following variables are created when a task is executed: HQ_CPUS - List of cores assigned to a task. (this is an alias for HQ_RESOURCE_VALUES_cpus ). HQ_PIN - Is set to taskset or omp (depending on the used pin mode) if the task was pinned by HyperQueue (see below). NUM_OMP_THREADS -- Set to number of cores assigned for task. (For compatibility with OpenMP). This option is not set when you ask for a non-integer number of CPUs. Pinning # By default, HQ internally allocates CPUs on a logical level. In other words, HQ ensures that the sum of requests of concurrently running tasks does not exceed the number of CPUs of the worker, but process assignment to cores is left to the system scheduler, which may move processes across CPUs as it wants. If this is not desired, especially in the case of NUMA, processes could be pinned, either manually or automatically. Automatic pinning # HyperQueue can pin threads using two ways: with taskset or by setting OpenMP environment variables. You can use the --pin flag to choose between these two modes. 
taskset OpenMP $ hq submit --pin taskset --cpus = 8 will cause HyperQueue to execute your program like this: taskset -c \"\" ` $ hq submit --pin omp --cpus = 8 will cause HyperQueue to execute your program like this: OMP_PROC_BIND = close OMP_PLACES = \"{}\" If any automatic pinning mode is enabled, the environment variable HQ_PIN will be set. Manual pinning # If you want to gain full control over core pinning, you may pin the process by yourself. The assigned CPUs are stored in the environment variable HQ_CPUS as a comma-delimited list of CPU IDs. You can use utilities such as taskset or numactl and pass them HQ_CPUS to pin a process to these CPUs. Warning If you manually pin your processes, do not also use the --pin flag of the submit command. It may have some unwanted interferences. Below you can find an example of a script file that pins the executed process manually using taskset and numactl : taskset numactl #!/bin/bash taskset -c $HQ_CPUS #!/bin/bash numactl -C $HQ_CPUS If you submit this script with hq submit --cpus=4 script.sh , it will pin your program to 4 CPUs allocated by HQ. NUMA allocation strategy # Workers automatically detect the number of CPUs and on Linux systems they also detect their partitioning into sockets. When a NUMA architecture is automatically detected, indexed resource with groups is used for resource \"cpus\". You can then use allocation strategies for groups to specify how sockets are allocated. They follow the same rules as normal allocation strategies; for clarity we are rephrasing the group allocation strategies in terms of cores and sockets: Compact ( compact ) - Tries to allocate cores on as few sockets as possible in the current worker state. $ hq submit --cpus = \"8 compact\" ... Strict Compact ( compact! ) - Always allocates cores on as few sockets as possible for a target node. The task will not be executed until the requirement could be fully fulfilled. For example, if your worker has 4 cores per socket, and you ask for 4 CPUs, it will always be executed on a single socket. If you ask for 8 CPUs, it will always be executed on two sockets. $ hq submit --cpus = \"8 compact!\" ... Tip You might encounter a problem in your shell when you try to specify the strict compact policy, because the definition contains an exclamation mark ( ! ). In that case, try to wrap the policy in single quotes, like this: $ hq submit --cpus = '8 compact!' ... Scatter ( scatter ) - Allocate cores across as many sockets possible, based on the currently available cores of a worker. If your worker has 4 sockets with 8 cores per socket, and you ask for 8 CPUs, then HQ will try to run the process with 2 CPUs on each socket, if possible given the currently available worker cores. $ hq submit --cpus = \"8 scatter\" ... The default policy is the compact policy, i.e. --cpus= is equivalent to --cpus=\" compact\" . Note Specifying a policy only has an effect if you have more than one socket (physical CPUs). In case of a single socket, policies are indistinguishable. CPU configuration # Each worker will automatically detect the number of CPUs available. On Linux systems, it will also detect the partitioning into sockets (NUMA configuration). In most cases, it should work out of the box. If you want to see how will a HQ worker see your CPU configuration without actually starting the worker, you can use the hq worker hwdetect command, which will print the detected CPU configuration. 
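For instance, before overriding the configuration manually (described below), you can check what a worker would detect on the target node; this simply runs the documented detection command, and the exact output format may differ between HyperQueue versions: $ hq worker hwdetect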
Manual specification of CPU configuration # If the automatic detection fails for some reason, or you want to manually configure the CPU configuration, you can use the --cpus flag when starting a worker. It is an alias for --resource cpus=... (More in Resource Management ), except it also allow to define --cpus=N where N is an integer; it is then interpreted as 1xN in the resource definition. Below there are some examples of configuration that you can specify: Worker with 8 CPUs and a single socket. $ hq worker start --cpus = 8 Worker with 2 sockets with 12 cores per socket. $ hq worker start --cpus = 2x12 Manually specify that the worker should use the following core ids and how they are organized into sockets. In this example, two sockets are defined, one with 3 cores and one with 2 cores. $ hq worker start --cpus =[[ 2 , 3 , 4 ] , [ 10 , 14 ]] Disable Hyper Threading # If you want to detect CPUs but ignore HyperThreading then --no-hyper-threading flag can be used. It will detect only the first virtual core of each physical core. Example: $ hq worker start --no-hyper-threading","title":"CPU Resources"},{"location":"jobs/cresources/#cpu-resource-management","text":"Note In this text, we use the term CPU for a resource that is provided by the operating system (e.g. what you get from /proc/cpuinfo ). In this meaning, it is usually a core of a physical CPU. In the text related to NUMA we use the term socket to refer to physical CPUs.","title":"CPU resource management"},{"location":"jobs/cresources/#brief-introduction","text":"HyperQueue allows you to select how many CPU cores will be allocated for each task. By default, each task requires a single CPU of the worker's node. This can be changed by the flag --cpus . For example, to submit a job with a task that requires 8 CPUs: $ hq submit --cpus = 8 This ensures that HyperQueue will exclusively reserve 8 CPUs for this task when it is started. This task would thus never be scheduled on a worker that has less than 8 CPUs. Note that this reservation exists on a logical level only. To ensure more direct mapping to physical cores, see pinning below.","title":"Brief introduction"},{"location":"jobs/cresources/#cpus-are-a-resource","text":"From version 0.13.0, CPUs are managed as any other resource under name \"cpus\", with the following additions: If a task does not explicitly specify the number of cpus, then it requests 1 CPU as default. CPUs request can be specified by hq submit --cpus=X ... where --cpus=X is a shortcut for --resource cpus=X , and X can be all valid requests for a resource, including values like all or 8 compact! . (More in Resource Management ). A task may be automatically pinned to a given CPUs (see pinning ). There are some extra environmental variables for CPUs (see below). CPUs are automatically detected. See below for information about NUMA or Hyper Threading. CPUs provided by a worker can be explicitly specified via --cpus , see below.","title":"CPUs are a resource"},{"location":"jobs/cresources/#cpu-related-environment-variables","text":"The following variables are created when a task is executed: HQ_CPUS - List of cores assigned to a task. (this is an alias for HQ_RESOURCE_VALUES_cpus ). HQ_PIN - Is set to taskset or omp (depending on the used pin mode) if the task was pinned by HyperQueue (see below). NUM_OMP_THREADS -- Set to number of cores assigned for task. (For compatibility with OpenMP). 
This option is not set when you ask for a non-integer number of CPUs.","title":"CPU related environment variables"},{"location":"jobs/cresources/#pinning","text":"By default, HQ internally allocates CPUs on a logical level. In other words, HQ ensures that the sum of requests of concurrently running tasks does not exceed the number of CPUs of the worker, but process assignment to cores is left to the system scheduler, which may move processes across CPUs as it wants. If this is not desired, especially in the case of NUMA, processes could be pinned, either manually or automatically.","title":"Pinning"},{"location":"jobs/cresources/#automatic-pinning","text":"HyperQueue can pin threads using two ways: with taskset or by setting OpenMP environment variables. You can use the --pin flag to choose between these two modes. taskset OpenMP $ hq submit --pin taskset --cpus = 8 will cause HyperQueue to execute your program like this: taskset -c \"\" ` $ hq submit --pin omp --cpus = 8 will cause HyperQueue to execute your program like this: OMP_PROC_BIND = close OMP_PLACES = \"{}\" If any automatic pinning mode is enabled, the environment variable HQ_PIN will be set.","title":"Automatic pinning"},{"location":"jobs/cresources/#manual-pinning","text":"If you want to gain full control over core pinning, you may pin the process by yourself. The assigned CPUs are stored in the environment variable HQ_CPUS as a comma-delimited list of CPU IDs. You can use utilities such as taskset or numactl and pass them HQ_CPUS to pin a process to these CPUs. Warning If you manually pin your processes, do not also use the --pin flag of the submit command. It may have some unwanted interferences. Below you can find an example of a script file that pins the executed process manually using taskset and numactl : taskset numactl #!/bin/bash taskset -c $HQ_CPUS #!/bin/bash numactl -C $HQ_CPUS If you submit this script with hq submit --cpus=4 script.sh , it will pin your program to 4 CPUs allocated by HQ.","title":"Manual pinning"},{"location":"jobs/cresources/#numa-allocation-strategy","text":"Workers automatically detect the number of CPUs and on Linux systems they also detect their partitioning into sockets. When a NUMA architecture is automatically detected, indexed resource with groups is used for resource \"cpus\". You can then use allocation strategies for groups to specify how sockets are allocated. They follow the same rules as normal allocation strategies; for clarity we are rephrasing the group allocation strategies in terms of cores and sockets: Compact ( compact ) - Tries to allocate cores on as few sockets as possible in the current worker state. $ hq submit --cpus = \"8 compact\" ... Strict Compact ( compact! ) - Always allocates cores on as few sockets as possible for a target node. The task will not be executed until the requirement could be fully fulfilled. For example, if your worker has 4 cores per socket, and you ask for 4 CPUs, it will always be executed on a single socket. If you ask for 8 CPUs, it will always be executed on two sockets. $ hq submit --cpus = \"8 compact!\" ... Tip You might encounter a problem in your shell when you try to specify the strict compact policy, because the definition contains an exclamation mark ( ! ). In that case, try to wrap the policy in single quotes, like this: $ hq submit --cpus = '8 compact!' ... Scatter ( scatter ) - Allocate cores across as many sockets possible, based on the currently available cores of a worker. 
If your worker has 4 sockets with 8 cores per socket, and you ask for 8 CPUs, then HQ will try to run the process with 2 CPUs on each socket, if possible given the currently available worker cores. $ hq submit --cpus = \"8 scatter\" ... The default policy is the compact policy, i.e. --cpus= is equivalent to --cpus=\" compact\" . Note Specifying a policy only has an effect if you have more than one socket (physical CPUs). In case of a single socket, policies are indistinguishable.","title":"NUMA allocation strategy"},{"location":"jobs/cresources/#cpu-configuration","text":"Each worker will automatically detect the number of CPUs available. On Linux systems, it will also detect the partitioning into sockets (NUMA configuration). In most cases, it should work out of the box. If you want to see how will a HQ worker see your CPU configuration without actually starting the worker, you can use the hq worker hwdetect command, which will print the detected CPU configuration.","title":"CPU configuration"},{"location":"jobs/cresources/#manual-specification-of-cpu-configuration","text":"If the automatic detection fails for some reason, or you want to manually configure the CPU configuration, you can use the --cpus flag when starting a worker. It is an alias for --resource cpus=... (More in Resource Management ), except it also allow to define --cpus=N where N is an integer; it is then interpreted as 1xN in the resource definition. Below there are some examples of configuration that you can specify: Worker with 8 CPUs and a single socket. $ hq worker start --cpus = 8 Worker with 2 sockets with 12 cores per socket. $ hq worker start --cpus = 2x12 Manually specify that the worker should use the following core ids and how they are organized into sockets. In this example, two sockets are defined, one with 3 cores and one with 2 cores. $ hq worker start --cpus =[[ 2 , 3 , 4 ] , [ 10 , 14 ]]","title":"Manual specification of CPU configuration"},{"location":"jobs/cresources/#disable-hyper-threading","text":"If you want to detect CPUs but ignore HyperThreading then --no-hyper-threading flag can be used. It will detect only the first virtual core of each physical core. Example: $ hq worker start --no-hyper-threading","title":"Disable Hyper Threading"},{"location":"jobs/directives/","text":"Directives # You can specify job parameters using special comments ( directives ) specified in a submitted shell script. Directives are lines that begin with the #HQ prefix. Any text following this prefix will be interpreted as a command line argument for hq submit . Example directive file # Suppose that script.sh has the following content: #!/bin/bash #HQ --name=Example #HQ --cpus=\"2 compact\" --pin taskset ./my-program If you execute $ hq submit script.sh it will behave as if you have executed $ hq submit --name = Example --cpus = \"2 compact\" --pin taskset script.sh Directives mode # You can select three modes using the --directives flag of hq submit . The mode will determine when should HyperQueue attempt to parse directives from the provided command. auto (default) - Directives will be parsed if the first command passed to hq submit has the .sh extension. file - Directives will be parsed from the first command passed to hq submit . stdin - Directives will be parsed from stdin (see --stdin ) off - Directives will not be parsed. Tip When HQ parses directives from a file, it will also try to parse a shebang line from the script and use it to select an interpreter for running the script. 
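As a sketch of the stdin mode, one possible combination of the documented --stdin and --directives flags is shown here (script.sh is a hypothetical script whose #HQ lines should be parsed even though it is piped in instead of being passed as a path): $ hq submit --stdin --directives=stdin bash < script.sh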
Notes # Directives have to be defined at the beginning of the file. Only comments or empty lines are allowed to precede the directives. Directives have to be defined in the first 32KiB of the file, the rest of the file is ignored. Parameters set via CLI have precedence over parameters set via direectives: Parameters that cannot occur multiple times (like --name ) will be overriden by values set from CLI. Parameters that can occur multiple times (like --resource ) will be combined from CLI and from directives. A script may contain more lines with the #HQ prefix, such lines are combined and evaluated as a continuous list of parameters.","title":"Directives"},{"location":"jobs/directives/#directives","text":"You can specify job parameters using special comments ( directives ) specified in a submitted shell script. Directives are lines that begin with the #HQ prefix. Any text following this prefix will be interpreted as a command line argument for hq submit .","title":"Directives"},{"location":"jobs/directives/#example-directive-file","text":"Suppose that script.sh has the following content: #!/bin/bash #HQ --name=Example #HQ --cpus=\"2 compact\" --pin taskset ./my-program If you execute $ hq submit script.sh it will behave as if you have executed $ hq submit --name = Example --cpus = \"2 compact\" --pin taskset script.sh","title":"Example directive file"},{"location":"jobs/directives/#directives-mode","text":"You can select three modes using the --directives flag of hq submit . The mode will determine when should HyperQueue attempt to parse directives from the provided command. auto (default) - Directives will be parsed if the first command passed to hq submit has the .sh extension. file - Directives will be parsed from the first command passed to hq submit . stdin - Directives will be parsed from stdin (see --stdin ) off - Directives will not be parsed. Tip When HQ parses directives from a file, it will also try to parse a shebang line from the script and use it to select an interpreter for running the script.","title":"Directives mode"},{"location":"jobs/directives/#notes","text":"Directives have to be defined at the beginning of the file. Only comments or empty lines are allowed to precede the directives. Directives have to be defined in the first 32KiB of the file, the rest of the file is ignored. Parameters set via CLI have precedence over parameters set via direectives: Parameters that cannot occur multiple times (like --name ) will be overriden by values set from CLI. Parameters that can occur multiple times (like --resource ) will be combined from CLI and from directives. A script may contain more lines with the #HQ prefix, such lines are combined and evaluated as a continuous list of parameters.","title":"Notes"},{"location":"jobs/failure/","text":"In distributed systems, failure is inevitable. This sections describes how HyperQueue handles various types of failures and how can you affect its behavior. Resubmitting array jobs # When a job fails or is canceled, you can submit it again. However, in case of task arrays , different tasks may end in different states, and often we want to recompute only tasks with a specific status (e.g. failed tasks). By following combination of commands you may recompute only failed tasks. 
Let us assume that we want to recompute all failed tasks in job 5: $ hq submit --array=`hq job task-ids 5 --filter=failed` ./my-computation It works as follows: Command hq job task-ids 5 --filter=failed returns IDs of failed jobs of job 5 , and we set it to --array parameter that starts only tasks for given IDs. If we want to recompute all failed tasks and all canceled tasks we can do it as follows: $ hq submit --array=`hq job task-ids 5 --filter=failed,canceled` ./my-computation Note that it also works with --each-line or --from-json , i.e.: # Original computation $ hq submit --each-line=input.txt ./my-computation # Resubmitting failed jobs $ hq submit --each-line=input.txt --array=`hq job task-ids last --filter=failed` ./my-computation Task restart # Sometimes a worker might crash while it is executing some task. In that case the server will automatically reschedule that task to a different worker and the task will begin executing from the beginning. In order to let the executed application know that the same task is being executed repeatedly, HyperQueue assigns each execution a separate Instance ID . It is a 32b non-negative number that identifies each (re-)execution of a task. It is guaranteed that a newer execution of a task will have a larger instance ID, however HyperQueue explicitly does not guarantee any specific values or differences between two IDs. Each instance ID is valid only for a particular task. Two different tasks may have the same instance ID. Instance IDs can be useful e.g. when a task is restarted, and you want to distinguish the output of the first execution and the restarted execution (by default, HQ will overwrite the standard output/error file of the first execution). You can instead create a separate stdout/stderr file for each task execution using the instance ID placeholder . Task array failures # By default, when a single task of a task array fails, the computation of the job will continue. You can change this behavior with the --max-fails= option of the submit command, where X is non-negative integer. If specified, once more tasks than X tasks fail, the rest of the job's tasks that were not completed yet will be canceled. For example: $ hq submit --array 1-1000 --max-fails 5 ... This will create a task array with 1000 tasks. Once 5 or more tasks fail, the remaining uncompleted tasks of the job will be canceled.","title":"Handling Failure"},{"location":"jobs/failure/#resubmitting-array-jobs","text":"When a job fails or is canceled, you can submit it again. However, in case of task arrays , different tasks may end in different states, and often we want to recompute only tasks with a specific status (e.g. failed tasks). By following combination of commands you may recompute only failed tasks. Let us assume that we want to recompute all failed tasks in job 5: $ hq submit --array=`hq job task-ids 5 --filter=failed` ./my-computation It works as follows: Command hq job task-ids 5 --filter=failed returns IDs of failed jobs of job 5 , and we set it to --array parameter that starts only tasks for given IDs. 
If we want to recompute all failed tasks and all canceled tasks we can do it as follows: $ hq submit --array=`hq job task-ids 5 --filter=failed,canceled` ./my-computation Note that it also works with --each-line or --from-json , i.e.: # Original computation $ hq submit --each-line=input.txt ./my-computation # Resubmitting failed jobs $ hq submit --each-line=input.txt --array=`hq job task-ids last --filter=failed` ./my-computation","title":"Resubmitting array jobs"},{"location":"jobs/failure/#task-restart","text":"Sometimes a worker might crash while it is executing some task. In that case the server will automatically reschedule that task to a different worker and the task will begin executing from the beginning. In order to let the executed application know that the same task is being executed repeatedly, HyperQueue assigns each execution a separate Instance ID . It is a 32b non-negative number that identifies each (re-)execution of a task. It is guaranteed that a newer execution of a task will have a larger instance ID, however HyperQueue explicitly does not guarantee any specific values or differences between two IDs. Each instance ID is valid only for a particular task. Two different tasks may have the same instance ID. Instance IDs can be useful e.g. when a task is restarted, and you want to distinguish the output of the first execution and the restarted execution (by default, HQ will overwrite the standard output/error file of the first execution). You can instead create a separate stdout/stderr file for each task execution using the instance ID placeholder .","title":"Task restart"},{"location":"jobs/failure/#task-array-failures","text":"By default, when a single task of a task array fails, the computation of the job will continue. You can change this behavior with the --max-fails= option of the submit command, where X is non-negative integer. If specified, once more tasks than X tasks fail, the rest of the job's tasks that were not completed yet will be canceled. For example: $ hq submit --array 1-1000 --max-fails 5 ... This will create a task array with 1000 tasks. Once 5 or more tasks fail, the remaining uncompleted tasks of the job will be canceled.","title":"Task array failures"},{"location":"jobs/jobfile/","text":"Job Definition File # Job Definition File (JDF) a way how to submit a complex pipeline into a HyperQueue. It is a TOML file that describes tasks of a job. JDF provides all functionalities as command line interface of HyperQueue and also adds access to additional features: Heterogeneous tasks -- Job may be composed of different tasks Dependencies -- Tasks may have dependencies Resource request alternatives -- Task may have alternative resource requests, e.g.: 4 cpus OR 1 cpus and 1 gpu Note that these features are also available through Python interface. Minimal example # First, we create file with the following content: [[task]] command = [ \"sleep\" , \"1\" ] Let us assume that we have named this file as myfile.toml , then we can run the following command to submit a job: $ hq job submit-file myfile.toml The effect will be same as running: $ hq submit sleep 1 Task configuration # The following shows how job and task may be configured in more detail. All options except command are optional. If not said otherwise, an option in format xxx = ... is an equivalent of --xxx = ... in hq submit command. The default are the same as CLI interface. 
name = \"test-job\" stream = \"path/to/stream/dir\" # Stdout/Stderr streaming (see --stream) max_fails = 11 [[task]] stdout = \"testout-%{TASK_ID} stderr = { path = \" testerr- %{ TASK_ID } \", mode = \" rm-if-finished \" } task_dir = true time_limit = \" 1m 10s \" priority = -1 crash_limit = 12 command = [\" / bin / bash \", \" -c \", \" echo $ ABC \"] # Environment variables env = { \" ABC \" = \" 123 \", \" XYZ \" = \" aaaa \" } # Content that will be written on stdin stdin = \" Hello world ! \" [[task.request]] resources = { \" cpus \" = \" 4 compact ! \", \" gpus \" = 2 } time_request = \" 10s \" More tasks # More tasks with different configuration may be defined as follows: [[task]] command = [ \"sleep\" , \"1\" ] [[task]] command = [ \"sleep\" , \"2\" ] [[task]] command = [ \"sleep\" , \"3\" ] In the case above, tasks are given automatic task ids from id 0. You can also specify IDs manually: [[task]] id = 10 command = [ \"sleep\" , \"1\" ] [[task]] id = 11 command = [ \"sleep\" , \"2\" ] [[task]] id = 2 command = [ \"sleep\" , \"3\" ] Task arrays # If you want to create uniform tasks you can define task array (similar to --array ): [[array]] ids = \"1,2,50-100\" command = [ \"sleep\" , \"1\" ] You can also specify array with content of HQ_ENTRIES : [[array]] entries = [ \"One\" , \"Two\" , \"Three\" ] command = [ \"sleep\" , \"1\" ] Note Options entries and ids can be used together. Task dependencies # Job Definition File allows to define a dependencies between tasks. In other words, it means that the task may be executed only if the previous tasks are already finished. The task's option deps defines on which tasks the given task dependents. The task is addressed by their IDs. The following example creates three tasks where the third task depends on the first two tasks. [[task]] id = 1 command = [ ...] [[task]] id = 3 command = [ ...] [[task]] id = 5 command = [ ...] deps = [ 1 , 3 ] # <---- Dependancy on tasks 1 and 3 Resource variants # More resource configurations may be defined for a task. In this case, HyperQueue will take into account all these configurations during scheduling. When a task is started exactly one configuration is chosen. If in a given moment more configuration are possible for a given task, the configuration first defined has a higher priority. The following configuration defines that a task may be executed on 1 cpus and 1 gpu OR on 4 cpus. [[task]] command = [ ...] [[task.request]] resources = { \"cpus\" = 1 , \"gpus\" = 1 } [[task.request]] resources = { \"cpus\" = 4 } In the case that many tasks with such a configuration are submitted to a worker with 16 cpus and 4 gpus then HyperQueue will run simultaneously 4 tasks in the first configuration and 3 tasks in the second one. For a task with resource variants, HyperQueue sets variable HQ_RESOURCE_VARIANT to an index of chosen variant (counted from 0) when a task is started. Non-integer resource amounts # You may specify a resource number as float, e.g. resources = { \"foo\" = 1.5 } . It is valid but internally the type if converted to float, that may for some numbers lead to a rounding up when number is converted to 4-digit precision of resource amounts. If you want to avoid this, put the number into parentheses, e.g. resources = { \"foo\" = \"1.5\" } .","title":"Job Definition File"},{"location":"jobs/jobfile/#job-definition-file","text":"Job Definition File (JDF) a way how to submit a complex pipeline into a HyperQueue. It is a TOML file that describes tasks of a job. 
JDF provides all the functionality of the HyperQueue command-line interface and also adds access to additional features: Heterogeneous tasks -- A job may be composed of different tasks Dependencies -- Tasks may have dependencies Resource request alternatives -- A task may have alternative resource requests, e.g. 4 cpus OR 1 cpu and 1 gpu Note that these features are also available through the Python interface.","title":"Job Definition File"},{"location":"jobs/jobfile/#minimal-example","text":"First, we create a file with the following content: [[task]] command = [ \"sleep\" , \"1\" ] Let us assume that we have named this file myfile.toml ; then we can run the following command to submit a job: $ hq job submit-file myfile.toml The effect will be the same as running: $ hq submit sleep 1","title":"Minimal example"},{"location":"jobs/jobfile/#task-configuration","text":"The following shows how a job and its tasks may be configured in more detail. All options except command are optional. If not said otherwise, an option in the format xxx = ... is an equivalent of --xxx = ... in the hq submit command. The defaults are the same as in the CLI interface. name = \"test-job\" stream = \"path/to/stream/dir\" # Stdout/Stderr streaming (see --stream) max_fails = 11 [[task]] stdout = \"testout-%{TASK_ID}\" stderr = { path = \"testerr-%{TASK_ID}\", mode = \"rm-if-finished\" } task_dir = true time_limit = \"1m 10s\" priority = -1 crash_limit = 12 command = [\"/bin/bash\", \"-c\", \"echo $ABC\"] # Environment variables env = { \"ABC\" = \"123\", \"XYZ\" = \"aaaa\" } # Content that will be written on stdin stdin = \"Hello world!\" [[task.request]] resources = { \"cpus\" = \"4 compact!\", \"gpus\" = 2 } time_request = \"10s\"","title":"Task configuration"},{"location":"jobs/jobfile/#more-tasks","text":"More tasks with different configurations may be defined as follows: [[task]] command = [ \"sleep\" , \"1\" ] [[task]] command = [ \"sleep\" , \"2\" ] [[task]] command = [ \"sleep\" , \"3\" ] In the case above, tasks are given automatic task ids starting from id 0. You can also specify IDs manually: [[task]] id = 10 command = [ \"sleep\" , \"1\" ] [[task]] id = 11 command = [ \"sleep\" , \"2\" ] [[task]] id = 2 command = [ \"sleep\" , \"3\" ]","title":"More tasks"},{"location":"jobs/jobfile/#task-arrays","text":"If you want to create uniform tasks, you can define a task array (similar to --array ): [[array]] ids = \"1,2,50-100\" command = [ \"sleep\" , \"1\" ] You can also specify an array with the content of HQ_ENTRIES : [[array]] entries = [ \"One\" , \"Two\" , \"Three\" ] command = [ \"sleep\" , \"1\" ] Note Options entries and ids can be used together.","title":"Task arrays"},{"location":"jobs/jobfile/#task-dependencies","text":"The Job Definition File allows you to define dependencies between tasks. In other words, a task may be executed only after all tasks it depends on have finished. The task's deps option defines which tasks the given task depends on; tasks are addressed by their IDs. The following example creates three tasks where the third task depends on the first two tasks. [[task]] id = 1 command = [ ...] [[task]] id = 3 command = [ ...] [[task]] id = 5 command = [ ...] deps = [ 1 , 3 ] # <---- Dependency on tasks 1 and 3","title":"Task dependencies"},{"location":"jobs/jobfile/#resource-variants","text":"More than one resource configuration may be defined for a task. In this case, HyperQueue will take all these configurations into account during scheduling.
When a task is started, exactly one configuration is chosen. If more than one configuration is possible for a task at a given moment, the configuration defined first has a higher priority. The following configuration defines that a task may be executed either on 1 cpu and 1 gpu OR on 4 cpus. [[task]] command = [ ...] [[task.request]] resources = { \"cpus\" = 1 , \"gpus\" = 1 } [[task.request]] resources = { \"cpus\" = 4 } If many tasks with this configuration are submitted to a worker with 16 cpus and 4 gpus, then HyperQueue will simultaneously run 4 tasks in the first configuration and 3 tasks in the second one. For a task with resource variants, HyperQueue sets the variable HQ_RESOURCE_VARIANT to the index of the chosen variant (counted from 0) when the task is started.","title":"Resource variants"},{"location":"jobs/jobfile/#non-integer-resource-amounts","text":"You may specify a resource amount as a float, e.g. resources = { \"foo\" = 1.5 } . This is valid, but internally the value is converted to a float, which may for some numbers lead to rounding when the number is converted to the 4-digit precision of resource amounts. If you want to avoid this, put the number into quotes, e.g. resources = { \"foo\" = \"1.5\" } .","title":"Non-integer resource amounts"},{"location":"jobs/jobs/","text":"The main unit of computation within HyperQueue is called a Task . It represents a single computation (currently, a single execution of some program) that is scheduled and executed on a worker. To actually compute something, you have to create a Job , which is a collection of tasks (a task graph). Jobs are units of computation management - you can submit, query or cancel jobs using the CLI. Note This section focuses on simple jobs , where each job contains exactly one task. See Task arrays to find out how to create jobs with multiple tasks. Identification numbers # Each job is identified by a positive integer that is assigned by the HyperQueue server when the job is submitted. We refer to it as Job id . Each task within a job is identified by an unsigned 32b integer called Task id . Task id is either generated by the server or assigned by the user. Task ids are always relative to a specific job, two tasks inside different jobs can thus have the same task id. In simple jobs, task id is always set to 0 . Submitting jobs # To submit a simple job that will execute some executable with the provided arguments, use the hq submit command: $ hq submit ... When you submit a job, the server will assign it a unique job id and print it. You can use this ID in following commands to refer to the submitted job. After the job is submitted, HyperQueue will distribute it to a connected worker that will then execute the provided command. Warning The provided command will be executed on a worker that might be running on a different machine. You should thus make sure that the binary will be available there and that you provide an absolute path to it. Note When your command contains its own command line flags, you must put the command and its flags after -- : $ hq submit -- /bin/bash -c 'echo $PPID' There are many parameters that you can set for the executed program, they are listed below. Name # Each job has an assigned name. It has only an informative character for the user. By default, the name is derived from the job's program name. You can also set the job name explicitly with the --name option: $ hq submit --name = ...
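A concrete instance of the option above (the script name is only an illustrative placeholder): $ hq submit --name=preprocessing ./prepare-data.sh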
Working directory # By default, the working directory of the job will be set to the directory from which the job was submitted. You can change this using the --cwd option: $ hq submit --cwd = ... Warning Make sure that the provided path exists on all worker nodes. Hint You can use placeholders in the working directory path. Output # By default, each job will produce two files containing the standard output and standard error output, respectively. The default paths of these files are %{CWD}/job-%{JOB_ID}/%{TASK_ID}.stdout for stdout %{CWD}/job-%{JOB_ID}/%{TASK_ID}.stderr for stderr %{JOB_ID} and %{TASK_ID} are so-called placeholders, you can read about them below . You can change these paths with the --stdout and --stderr options. You can also avoid creating stdout / stderr files completely by setting the value to none : Change output paths Disable stdout $ hq submit --stdout = out.txt --stderr = err.txt ... $ hq submit --stdout = none ... Warning Make sure that the provided path(s) exist on all worker nodes. Also note that if you provide a relative path, it will be resolved relative to the directory from where you submit the job, not relative to the working directory of the job. If you want to change that, use the %{CWD} placeholder . Environment variables # You can set environment variables which will be passed to the provided command when the job is executed using the --env = option. Multiple environment variables can be passed if you repeat the option. $ hq submit --env KEY1 = VAL1 --env KEY2 = VAL2 ... Each executed task will also automatically receive the following environment variables: Variable name Explanation HQ_JOB_ID Job id HQ_TASK_ID Task id HQ_INSTANCE_ID Instance id HQ_RESOURCE_... A set of variables related to allocated resources Time management # You can specify two time-related parameters when submitting a job. They will be applied to each task of the submitted job. Time Limit is the maximal running time of a task. If it is reached, the task will be terminated, and it will transition into the Failed state . This setting has no impact on scheduling. This can serve as a sanity check to make sure that some task will not run indefinitely. You can set it with the --time-limit option 1 : $ hq submit --time-limit = ... Note Time limit is counted separately for each task. If you set a time limit of 3 minutes and create two tasks, where each will run for two minutes, the time limit will not be hit. Time Request is the minimal remaining lifetime that a worker must have in order to start executing the task. Workers that do not have enough remaining lifetime will not be considered for running this task. Time requests are only used during scheduling, where the server decides which worker should execute which task. Once a task is scheduled and starts executing on a worker, the time request value will not have any effect. You can set the time request using the --time-request option 1 : $ hq submit --time-request = ... Note Workers with an unknown remaining lifetime will be able to execute any task, disregarding its time request. Here is an example situation where time limit and time request can be used: Let's assume that we have a collection of tasks where the vast majority of tasks usually finish within 10 minutes, but some of them run for (at most) 30 minutes. We do not know in advance which tasks will be \"slow\". In this case we may want to set the time limit to 35 minutes to protect us against an error (deadlock, endless loop, etc.). 
However, since we know that each task will usually take at least 10 minutes to execute, we don't want to start executing it on a worker if we know that the worker will definitely terminate in less than 10 minutes. It would only cause unnecessary lost computational resources. Therefore, we can set the time request to 10 minutes. Priority # You can modify the order in which tasks are executed using Priority . Priority can be any 32b signed integer. A lower number signifies lower priority, e.g. when task A with priority 5 and task B with priority 3 are scheduled to the same worker and only one of them may be executed, then A will be executed first. You can set the priority using the --priority option: $hq submit --priority = If no priority is specified, then each task will have priority 0 . Placeholders # You can use special variables when setting certain job parameters ( working directory , output paths, log path). These variables, called Placeholders , will be replaced by job or task-specific information before the job is executed. Placeholders are enclosed in curly braces ( {} ) and prefixed with a percent ( % ) sign. You can use the following placeholders: Placeholder Will be replaced by Available for %{JOB_ID} Job ID stdout , stderr , cwd , log %{TASK_ID} Task ID stdout , stderr , cwd %{INSTANCE_ID} Instance ID stdout , stderr , cwd %{SUBMIT_DIR} Directory from which the job was submitted. stdout , stderr , cwd , log %{CWD} Working directory of the task. stdout , stderr %{SERVER_UID} Unique server ID. stdout , stderr , cwd , log SERVER_UID is a random string that is unique for each new server execution (each hq server start gets a separate value). As an example, if you wanted to include the Instance ID in the stdout path (to distinguish the individual outputs of restarted tasks), you can use placeholders like this: $ hq submit --stdout '%{CWD}/job-%{JOB_ID}/%{TASK_ID}-%{INSTANCE_ID}.stdout' ... State # At any moment in time, each task and job has a specific state that represents what is currently happening to it. You can query the state of a job with the following command 2 : $ hq job info Task state # Each task starts in the Waiting state and can end up in one of the terminal states: Finished , Failed or Canceled . Waiting-----------------\\ | ^ | | | | v | | Running-----------------| | | | | \\--------\\ | | | | v v v Finished Failed Canceled Waiting The task was submitted and is now waiting to be executed. Running The task is running on a worker. It may become Waiting again when the worker where the task is running crashes. Finished The task has successfully finished. Failed The task has failed. Canceled The task has been canceled . If a task is in the Finished , Failed or Canceled state, it is completed . Job state # The state of a job is derived from the states of its individual tasks. The state is determined by the first rule that matches from the following list of rules: If at least one task is Running , then job state is Running . If at least one task has not been completed yet, then job state is Waiting . If at least one task is Failed , then job state is Failed . If at least one task is Canceled , then job state is Canceled . If all tasks are finished and job is open (see Open Jobs ), then job state is Opened . Remaining case: all tasks are Finished and job is closed, then job state is Finished . 
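As a worked illustration of these rules: if a closed job has three tasks in the states Finished, Failed and Canceled, the first two rules do not match (no task is Running and every task is completed), the third rule does, and the job is therefore reported as Failed.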
Cancelling jobs # You can prematurely terminate a submitted job that has not been completed yet by cancelling it using the hq job cancel command 2 : $ hq job cancel Cancelling a job will cancel all of its tasks that are not yet completed. Forgetting jobs # If you want to completely forget a job, and thus free up its associated memory, you can do that using the hq job forget command 2 : $ hq job forget By default, all completed jobs (finished/failed/canceled) will be forgotten. You can use the --status parameter to only forget jobs in certain statuses: $ hq job forget all --status finished,canceled However, only jobs that are completed, i.e. that have finished successfully, failed or have been canceled, can be forgotten. If you want to forget a waiting or a running job, cancel it first. Waiting for jobs # There are three ways of waiting until a job completes: Submit and wait You can use the --wait flag when submitting a job. This will cause the submission command to wait until the job becomes complete: $ hq submit --wait ... Tip This method can be used for benchmarking the job duration. Wait command There is a separate hq job wait command that can be used to wait until an existing job completes 2 : $ hq job wait Interactive wait If you want to interactively observe the status of a job (which is useful especially if it has multiple tasks ), you can use the hq job progress command: Submit and observe Observe an existing job 2 $ hq submit --progress ... $ hq job progress Attaching standard input # When the --stdin flag is used, HQ captures standard input and attaches it to each task of a job. When a task is started, the attached data is written to the standard input of the task. This can be used to submit scripts without creating a file. The following command will capture stdin and execute it in Bash: $ hq submit --stdin bash If you want to parse #HQ directives from standard input, you can use --directives=stdin . Task directory # When a job is submitted with --task-dir , a temporary directory is created for each task and passed via the environment variable HQ_TASK_DIR . This directory is automatically deleted when the task is completed (for any reason). Providing own error message # A task may pass its own error message to HyperQueue. HyperQueue provides a filename via the environment variable HQ_ERROR_FILENAME ; if a task creates this file and terminates with a non-zero return code, then the content of this file is taken as its error message. HQ_ERROR_FILENAME is provided only if the task directory is enabled. The file is always placed inside the task directory. If the message is longer than 2KiB, it is truncated to 2KiB. If the task terminates with a zero return code, the error file is ignored. Automatic file cleanup # If you create a lot of tasks and do not use output streaming , a lot of stdout / stderr files can be created on the disk. In certain cases, you might not be interested in the contents of these files, especially if the task has finished successfully, and you instead want to remove them as soon as they are not needed. For that, you can use a file cleanup mode when specifying stdout and/or stderr to choose what should happen with the file when its task finishes. The mode is specified as a name following a colon ( : ) after the file path. Currently, one cleanup mode is implemented: Remove the file if the task has finished successfully: $ hq submit --stdout = \"out.txt:rm-if-finished\" /my-program The file will not be deleted if the task fails or is cancelled.
Note If you want to use the default stdout / stderr file path (and you don't want to look it up), you can also specify just the cleanup mode without the file path: $ hq submit --stdout = \":rm-if-finished\" /my-program Useful job commands # Here is a list of useful job commands: Display job table # List queued and running jobs List all jobs List jobs by status $ hq job list $ hq job list --all You can display only jobs having the selected states by using the --filter flag: $ hq job list --filter running,waiting Valid filter values are: waiting running finished failed canceled Display a summary table of all jobs # $ hq job summary Display information about a specific job # $ hq job info Display information about individual tasks (potentially across multiple jobs) # $ hq task list [--task-status ] [--tasks ] Display job stdout / stderr # $ hq job cat [--tasks ] Crashing limit # When a worker is lost then all running tasks on the worker are suspicious that they may cause the crash of the worker. HyperQueue server remembers how many times were a task running while a worker is lost. If the count reaches the limit, then the task is set to the failed state. By default, this limit is 5 but it can be changed as follows: $ hq submit --crash-limit= ... If the limit is set to 0, then the limit is disabled. You can use various shortcuts for the duration value. \u21a9 \u21a9 You can use various shortcuts to select multiple jobs at once. \u21a9 \u21a9 \u21a9 \u21a9 \u21a9","title":"Jobs and Tasks"},{"location":"jobs/jobs/#identification-numbers","text":"Each job is identified by a positive integer that is assigned by the HyperQueue server when the job is submitted. We refer to it as Job id . Each task within a job is identified by an unsigned 32b integer called Task id . Task id is either generated by the server or assigned by the user. Task ids are always relative to a specific job, two tasks inside different jobs can thus have the same task id. In simple jobs, task id is always set to 0 .","title":"Identification numbers"},{"location":"jobs/jobs/#submitting-jobs","text":"To submit a simple job that will execute some executable with the provided arguments, use the hq submit command: $ hq submit ... When you submit a job, the server will assign it a unique job id and print it. You can use this ID in following commands to refer to the submitted job. After the job is submitted, HyperQueue will distribute it to a connected worker that will then execute the provided command. Warning The provided command will be executed on a worker that might be running on a different machine. You should thus make sure that the binary will be available there and that you provide an absolute path to it. Note When your command contains its own command line flags, you must put the command and its flags after -- : $ hq submit -- /bin/bash -c 'echo $PPID' There are many parameters that you can set for the executed program, they are listed below.","title":"Submitting jobs"},{"location":"jobs/jobs/#name","text":"Each job has an assigned name. It has only an informative character for the user. By default, the name is derived from the job's program name. You can also set the job name explicitly with the --name option: $ hq submit --name = ...","title":"Name"},{"location":"jobs/jobs/#working-directory","text":"By default, the working directory of the job will be set to the directory from which the job was submitted. You can change this using the --cwd option: $ hq submit --cwd = ... 
Warning Make sure that the provided path exists on all worker nodes. Hint You can use placeholders in the working directory path.","title":"Working directory"},{"location":"jobs/jobs/#output","text":"By default, each job will produce two files containing the standard output and standard error output, respectively. The default paths of these files are %{CWD}/job-%{JOB_ID}/%{TASK_ID}.stdout for stdout %{CWD}/job-%{JOB_ID}/%{TASK_ID}.stderr for stderr %{JOB_ID} and %{TASK_ID} are so-called placeholders, you can read about them below . You can change these paths with the --stdout and --stderr options. You can also avoid creating stdout / stderr files completely by setting the value to none : Change output paths Disable stdout $ hq submit --stdout = out.txt --stderr = err.txt ... $ hq submit --stdout = none ... Warning Make sure that the provided path(s) exist on all worker nodes. Also note that if you provide a relative path, it will be resolved relative to the directory from where you submit the job, not relative to the working directory of the job. If you want to change that, use the %{CWD} placeholder .","title":"Output"},{"location":"jobs/jobs/#environment-variables","text":"You can set environment variables which will be passed to the provided command when the job is executed using the --env = option. Multiple environment variables can be passed if you repeat the option. $ hq submit --env KEY1 = VAL1 --env KEY2 = VAL2 ... Each executed task will also automatically receive the following environment variables: Variable name Explanation HQ_JOB_ID Job id HQ_TASK_ID Task id HQ_INSTANCE_ID Instance id HQ_RESOURCE_... A set of variables related to allocated resources","title":"Environment variables"},{"location":"jobs/jobs/#time-management","text":"You can specify two time-related parameters when submitting a job. They will be applied to each task of the submitted job. Time Limit is the maximal running time of a task. If it is reached, the task will be terminated, and it will transition into the Failed state . This setting has no impact on scheduling. This can serve as a sanity check to make sure that some task will not run indefinitely. You can set it with the --time-limit option 1 : $ hq submit --time-limit = ... Note Time limit is counted separately for each task. If you set a time limit of 3 minutes and create two tasks, where each will run for two minutes, the time limit will not be hit. Time Request is the minimal remaining lifetime that a worker must have in order to start executing the task. Workers that do not have enough remaining lifetime will not be considered for running this task. Time requests are only used during scheduling, where the server decides which worker should execute which task. Once a task is scheduled and starts executing on a worker, the time request value will not have any effect. You can set the time request using the --time-request option 1 : $ hq submit --time-request = ... Note Workers with an unknown remaining lifetime will be able to execute any task, disregarding its time request. Here is an example situation where time limit and time request can be used: Let's assume that we have a collection of tasks where the vast majority of tasks usually finish within 10 minutes, but some of them run for (at most) 30 minutes. We do not know in advance which tasks will be \"slow\". In this case we may want to set the time limit to 35 minutes to protect us against an error (deadlock, endless loop, etc.). 
However, since we know that each task will usually take at least 10 minutes to execute, we don't want to start executing it on a worker if we know that the worker will definitely terminate in less than 10 minutes. It would only cause unnecessary lost computational resources. Therefore, we can set the time request to 10 minutes.","title":"Time management"},{"location":"jobs/jobs/#priority","text":"You can modify the order in which tasks are executed using Priority . Priority can be any 32b signed integer. A lower number signifies lower priority, e.g. when task A with priority 5 and task B with priority 3 are scheduled to the same worker and only one of them may be executed, then A will be executed first. You can set the priority using the --priority option: $hq submit --priority = If no priority is specified, then each task will have priority 0 .","title":"Priority"},{"location":"jobs/jobs/#placeholders","text":"You can use special variables when setting certain job parameters ( working directory , output paths, log path). These variables, called Placeholders , will be replaced by job or task-specific information before the job is executed. Placeholders are enclosed in curly braces ( {} ) and prefixed with a percent ( % ) sign. You can use the following placeholders: Placeholder Will be replaced by Available for %{JOB_ID} Job ID stdout , stderr , cwd , log %{TASK_ID} Task ID stdout , stderr , cwd %{INSTANCE_ID} Instance ID stdout , stderr , cwd %{SUBMIT_DIR} Directory from which the job was submitted. stdout , stderr , cwd , log %{CWD} Working directory of the task. stdout , stderr %{SERVER_UID} Unique server ID. stdout , stderr , cwd , log SERVER_UID is a random string that is unique for each new server execution (each hq server start gets a separate value). As an example, if you wanted to include the Instance ID in the stdout path (to distinguish the individual outputs of restarted tasks), you can use placeholders like this: $ hq submit --stdout '%{CWD}/job-%{JOB_ID}/%{TASK_ID}-%{INSTANCE_ID}.stdout' ...","title":"Placeholders"},{"location":"jobs/jobs/#state","text":"At any moment in time, each task and job has a specific state that represents what is currently happening to it. You can query the state of a job with the following command 2 : $ hq job info ","title":"State"},{"location":"jobs/jobs/#task-state","text":"Each task starts in the Waiting state and can end up in one of the terminal states: Finished , Failed or Canceled . Waiting-----------------\\ | ^ | | | | v | | Running-----------------| | | | | \\--------\\ | | | | v v v Finished Failed Canceled Waiting The task was submitted and is now waiting to be executed. Running The task is running on a worker. It may become Waiting again when the worker where the task is running crashes. Finished The task has successfully finished. Failed The task has failed. Canceled The task has been canceled . If a task is in the Finished , Failed or Canceled state, it is completed .","title":"Task state"},{"location":"jobs/jobs/#job-state","text":"The state of a job is derived from the states of its individual tasks. The state is determined by the first rule that matches from the following list of rules: If at least one task is Running , then job state is Running . If at least one task has not been completed yet, then job state is Waiting . If at least one task is Failed , then job state is Failed . If at least one task is Canceled , then job state is Canceled . If all tasks are finished and job is open (see Open Jobs ), then job state is Opened . 
Remaining case: all tasks are Finished and job is closed, then job state is Finished .","title":"Job state"},{"location":"jobs/jobs/#cancelling-jobs","text":"You can prematurely terminate a submitted job that haven't been completed yet by cancelling it using the hq job cancel command 2 : $ hq job cancel Cancelling a job will cancel all of its tasks that are not yet completed.","title":"Cancelling jobs"},{"location":"jobs/jobs/#forgetting-jobs","text":"If you want to completely forget a job, and thus free up its associated memory, you can do that using the hq job forget command 2 : $ hq job forget By default, all completed jobs (finished/failed/canceled) will be forgotten. You can use the --status parameter to only forget jobs in certain statuses: $ hq job forget all --status finished,canceled However, only jobs that are completed, i.e. that have been finished successfully, failed or have been canceled, can be forgotten. If you want to forget a waiting or a running job, cancel it first.","title":"Forgetting jobs"},{"location":"jobs/jobs/#waiting-for-jobs","text":"There are three ways of waiting until a job completes: Submit and wait You can use the --wait flag when submitting a job. This will cause the submission command to wait until the job becomes complete: $ hq submit --wait ... Tip This method can be used for benchmarking the job duration. Wait command There is a separate hq job wait command that can be used to wait until an existing job completes 2 : $ hq job wait Interactive wait If you want to interactively observe the status of a job (which is useful especially if it has multiple tasks ), you can use the hq job progress command: Submit and observe Observe an existing job 2 $ hq submit --progress ... $ hq job progress ","title":"Waiting for jobs"},{"location":"jobs/jobs/#attaching-standard-input","text":"When --stdin flag is used, HQ captures standard input and attaches it to each task of a job. When a task is started then the attached data is written into the standard input of the task. This can be used to submitting scripts without creating file. The following command will capture stdin and executes it in Bash $ hq submit --stdin bash If you want to parse #HQ directives from standard input, you can use --directives=stdin .","title":"Attaching standard input"},{"location":"jobs/jobs/#task-directory","text":"When a job is submitted with --task-dir then a temporary directory is created for each task and passed via environment variable HQ_TASK_DIR . This directory is automatically deleted when the task is completed (for any reason).","title":"Task directory"},{"location":"jobs/jobs/#providing-own-error-message","text":"A task may pass its own error message into the HyperQueue. HyperQueue provides a filename via environment variable HQ_ERROR_FILENAME , if a task creates this file and terminates with a non-zero return code, then the content of this file is taken as an error message. HQ_ERROR_FILENAME is provided only if task directory is set on. The filename is always placed inside the task directory. If the message is longer than 2KiB, then it is truncated to 2KiB. If task terminates with zero return code, then the error file is ignored.","title":"Providing own error message"},{"location":"jobs/jobs/#automatic-file-cleanup","text":"If you create a lot of tasks and do not use output streaming , a lot of stdout / stderr files can be created on the disk. 
In certain cases, you might not be interested in the contents of these files, especially if the task has finished successfully, and you instead want to remove them as soon as they are not needed. For that, you can use a file cleanup mode when specifying stdout and/or stderr to choose what should happen with the file when its task finishes. The mode is specified as a name following a colon ( : ) after the file path. Currently, one cleanup mode is implemented: Remove the file if the task has finished successfully: $ hq submit --stdout = \"out.txt:rm-if-finished\" /my-program The file will not be deleted if the task fails or is cancelled. Note If you want to use the default stdout / stderr file path (and you don't want to look it up), you can also specify just the cleanup mode without the file path: $ hq submit --stdout = \":rm-if-finished\" /my-program","title":"Automatic file cleanup"},{"location":"jobs/jobs/#useful-job-commands","text":"Here is a list of useful job commands:","title":"Useful job commands"},{"location":"jobs/jobs/#display-job-table","text":"List queued and running jobs List all jobs List jobs by status $ hq job list $ hq job list --all You can display only jobs having the selected states by using the --filter flag: $ hq job list --filter running,waiting Valid filter values are: waiting running finished failed canceled","title":"Display job table"},{"location":"jobs/jobs/#display-a-summary-table-of-all-jobs","text":"$ hq job summary","title":"Display a summary table of all jobs"},{"location":"jobs/jobs/#display-information-about-a-specific-job","text":"$ hq job info ","title":"Display information about a specific job"},{"location":"jobs/jobs/#display-information-about-individual-tasks-potentially-across-multiple-jobs","text":"$ hq task list [--task-status ] [--tasks ]","title":"Display information about individual tasks (potentially across multiple jobs)"},{"location":"jobs/jobs/#display-job-stdoutstderr","text":"$ hq job cat [--tasks ] ","title":"Display job stdout/stderr"},{"location":"jobs/jobs/#crashing-limit","text":"When a worker is lost then all running tasks on the worker are suspicious that they may cause the crash of the worker. HyperQueue server remembers how many times were a task running while a worker is lost. If the count reaches the limit, then the task is set to the failed state. By default, this limit is 5 but it can be changed as follows: $ hq submit --crash-limit= ... If the limit is set to 0, then the limit is disabled. You can use various shortcuts for the duration value. \u21a9 \u21a9 You can use various shortcuts to select multiple jobs at once. \u21a9 \u21a9 \u21a9 \u21a9 \u21a9","title":"Crashing limit"},{"location":"jobs/multinode/","text":"Warning Multi-node support is now in the experimental stage. The core functionality is working, but some features may be limited and quality of scheduling may vary. Also auto allocation feature is not yet fully prepared for multi-node tasks. Multi-node tasks are tasks that spreads across multiple nodes. Each node reserved for such task is exclusively reserved, i.e. no other tasks may run on such nodes. A job with multi-node task can be specified by --nodes=X option. An example of a job with multi-node task asking for 4 nodes: $ hq submit --nodes 4 test.sh When the task is started, four nodes are assigned to this task. One of them is chosen as \"root\" node where test.sh is started. Node names of all assigned nodes can be found in file which path is in environmental variable HQ_NODE_FILE . 
Each line is a node name. The first line is always the root node. The node name is a short hostname, i.e. the hostname stripped of the suffix after the first \".\" (e.g. if the hostname of a worker is \"cn690.karolina.it4i.cz\", then the node name is \"cn690\"). Many HPC applications use only short hostnames, hence we provide them by default. If you need the full hostnames, there is a file whose name is written in HQ_HOST_FILE ; it has the same meaning as HQ_NODE_FILE but contains the full node hostnames without stripping. Note: Both files are placed in the task directory; therefore, a multi-node task always enables the task directory ( --task-dir ). If a multi-node task is started, HQ also creates the variable HQ_NUM_NODES that holds the number of nodes assigned to the task (i.e. the number of lines of the node file). Groups # A multi-node task is started only on workers that belong to the same group. By default, workers are grouped by PBS/Slurm allocations and workers outside any allocation are put in the \"default\" group. The group of a worker can be specified at the start of the worker and it may be any string. Example: $ hq worker start --group my_group Running MPI tasks # A script that starts an MPI program in a multi-node task may look as follows: #!/bin/sh mpirun --node-list = $HQ_NODE_FILE ./a-program If you are running SLURM, you should start the MPI program as follows: #!/bin/sh srun --nodefile=$HQ_NODE_FILE --nodes=$HQ_NUM_NODES mpirun ... Note: It is important to set --nodes , otherwise the node file will not be respected.","title":"Multinode Tasks"},{"location":"jobs/multinode/#groups","text":"A multi-node task is started only on workers that belong to the same group. By default, workers are grouped by PBS/Slurm allocations and workers outside any allocation are put in the \"default\" group. The group of a worker can be specified at the start of the worker and it may be any string. Example: $ hq worker start --group my_group","title":"Groups"},{"location":"jobs/multinode/#running-mpi-tasks","text":"A script that starts an MPI program in a multi-node task may look as follows: #!/bin/sh mpirun --node-list = $HQ_NODE_FILE ./a-program If you are running SLURM, you should start the MPI program as follows: #!/bin/sh srun --nodefile=$HQ_NODE_FILE --nodes=$HQ_NUM_NODES mpirun ... Note: It is important to set --nodes , otherwise the node file will not be respected.","title":"Running MPI tasks"},{"location":"jobs/openjobs/","text":"Open jobs # By default, a job is a set of tasks that are created atomically during a submit, and no other task can be added to the job. We call such a job closed . In contrast, HQ allows you to create an open job to which new tasks can be submitted as long as it is open. Opening a job # A job can be opened by the following command: $ hq job open If opening was successful, this will be printed: Job is open. If you want to get just the ID without any additional text, you can open the job as follows: $ hq --output-mode=quiet job open Note: In the list of jobs, an open job is marked with \"*\" before the id. Submitting tasks into open jobs # A submit to an open job is the same as a normal submit, except that you must specify the job you are submitting to with the --job argument. You may submit multiple times into the same job. Tasks are scheduled to the workers immediately when they are received by the server. $ hq submit --job ... other submit args ... $ hq submit --job ... other submit args ... $ hq submit --job ... other submit args ... Task Ids # All tasks in one job share the task ID space.
When you do not specify task ids, HQ automatically assigns a smallest ID that is bigger then any existing task id. $ hq job open $ hq submit --job -- hostname # Task ID is 0 $ hq submit --job -- hostname # Task ID is 1 # Task IDs are 2, 3, 4 ... $ hq submit --job --each-line='test.txt' -- do-something If you are explicitly specifying task IDs, it is an error if task ID is reused: $ hq submit --job -- hostname # Task ID is 0 # This is Ok $ hq submit --job --array 10-20 -- hostname # This fails: Task ID 0 and 10, 11, 12 already exist $ hq submit --job --array 0-12 -- hostname Job name and --max-fails # Job's name and configuration open --max-fails are the property of the job. They can be set when job is opened and they cannot be later changed. Submit options --name and --max-fails cannot be used if you are submitting into an open job. # Configuring jobs's name and max fails $ hq job open --name=MyOpenJob --max-fails=10 # Submit fails becase --max-fails cannot be used together with --job $ hq submit --job --max-fails=5 ... Submit file into open job # Submitting job definition file into an open job works in the similar way as a normal submit, you just need to add --job parameter. $ hq job submit-file --job job-definition.toml Closing job # You can close a job by calling: $ hq job close When a job is closed, you are not allowed to submit any more tasks to the job. It has no effect on tasks already submitted to the job; they continue to be processed as usual. Closing of already closed job throws an error. Leaving open jobs has no overhead, but it does affect the semantics of job completion. A job is considered completed when all tasks have been completed and the job is closed . Therefore, hq job wait ... will wait until all tasks of the selected jobs are complete and the jobs are closed. If you want to wait only for completion of tasks and ignoring if job is open or closed then there is hq job wait --without-close ... .","title":"Open jobs"},{"location":"jobs/openjobs/#open-jobs","text":"By default, a job is a set of tasks that are created atomically during a submit, and no other task can be added to the job. We call this job closed . In contrast, HQ allows you to create an open job that allows new tasks to be submitted as long as it is open.","title":"Open jobs"},{"location":"jobs/openjobs/#opening-a-job","text":"A job can be opened by the following command: $ hq job open If opening was successful, this will be printed: Job is open. If you want to get just ID without any additional text, you can open job as follows: $ hq --output-mode=quiet job open Note: In the list of jobs, an open job is marked with \"*\" before the id.","title":"Opening a job"},{"location":"jobs/openjobs/#submitting-tasks-into-open-jobs","text":"A submit to an open job is the same as a normal submit, except that you must specify the job you are submitting to with the --job argument. You may submit multiple times into the same job. Tasks are scheduled to the workers immediately when they are received by the server. $ hq submit --job ... other submit args ... $ hq submit --job ... other submit args ... $ hq submit --job ... other submit args ...","title":"Submitting tasks into open jobs"},{"location":"jobs/openjobs/#task-ids","text":"All tasks in one job share the task ID space. When you do not specify task ids, HQ automatically assigns a smallest ID that is bigger then any existing task id. $ hq job open $ hq submit --job -- hostname # Task ID is 0 $ hq submit --job -- hostname # Task ID is 1 # Task IDs are 2, 3, 4 ... 
$ hq submit --job --each-line='test.txt' -- do-something If you are explicitly specifying task IDs, it is an error if task ID is reused: $ hq submit --job -- hostname # Task ID is 0 # This is Ok $ hq submit --job --array 10-20 -- hostname # This fails: Task ID 0 and 10, 11, 12 already exist $ hq submit --job --array 0-12 -- hostname","title":"Task Ids"},{"location":"jobs/openjobs/#job-name-and-max-fails","text":"Job's name and configuration open --max-fails are the property of the job. They can be set when job is opened and they cannot be later changed. Submit options --name and --max-fails cannot be used if you are submitting into an open job. # Configuring jobs's name and max fails $ hq job open --name=MyOpenJob --max-fails=10 # Submit fails becase --max-fails cannot be used together with --job $ hq submit --job --max-fails=5 ...","title":"Job name and --max-fails"},{"location":"jobs/openjobs/#submit-file-into-open-job","text":"Submitting job definition file into an open job works in the similar way as a normal submit, you just need to add --job parameter. $ hq job submit-file --job job-definition.toml","title":"Submit file into open job"},{"location":"jobs/openjobs/#closing-job","text":"You can close a job by calling: $ hq job close When a job is closed, you are not allowed to submit any more tasks to the job. It has no effect on tasks already submitted to the job; they continue to be processed as usual. Closing of already closed job throws an error. Leaving open jobs has no overhead, but it does affect the semantics of job completion. A job is considered completed when all tasks have been completed and the job is closed . Therefore, hq job wait ... will wait until all tasks of the selected jobs are complete and the jobs are closed. If you want to wait only for completion of tasks and ignoring if job is open or closed then there is hq job wait --without-close ... .","title":"Closing job"},{"location":"jobs/resources/","text":"Resource management # Resource management serves for defining arbitrary resources provided by workers and also corresponding resource requests required by tasks. HyperQueue will take care of matching task resource requests so that only workers that can fulfill them will be able to execute such tasks. Some generic resources are automatically detected ; however, users may also define their own resources. From version 0.13.0, CPUs are also managed as other resources, but they have still some extra functionality; therefore, there is a special section about CPU resources . Important Resources in HyperQueue exist on a purely logical level. They can correspond to physical things (like GPUs), but it is the responsibility of the user to make sure that this correspondence makes sense. With exception of CPUs, HyperQueue by itself does not attach any semantics to resources, they are just numbers used for scheduling. Worker resources # Each worker has one or mores resources attached. Each resource is a resource pool identified by a name. A resource pool represents some resources provided by a worker; each task can then ask for a part of the resources contained in that pool. There are two kinds of resource pools: Indexed pool : This pool represents an enumerated set of resources represented by strings. Each resource has its own identity. Tasks do not ask for specific values from the set, they just specify how many resources they require and HyperQueue will allocate the specified amount of resources from the pool for each task. 
This pool is useful for resources that have their own identity, for example individual GPU or FPGA accelerators. HyperQueue guarantees that no individual resource from the indexed pool is allocated to more than a single task at any given time and that a task will not be executed on a worker if it does not currently have enough individual resources to fulfill the resource request of the task. Indexed pool can be defined with groups where indices live in separated groups. Task may then ask for different allocation policies (e.g. use resources from the same or different groups). The main purpose of this is to capture NUMA architectures, each group then represents a socket with cores. Sum pool : This pool represents a resource that has a certain size which is split into individual tasks. A typical example is memory; if a worker has 2000 bytes of memory, it can serve e.g. four tasks, if each task asks for 500 bytes of memory. HyperQueue guarantees that the sum of resource request sizes of running tasks on a worker does not exceed the total size of the sum pool. Specifying worker resources # You can specify the resource pools of a worker when you start it: $ hq worker start --resource \"=\" --resource \"=\" ... where NAMEi is a name (string ) of the i -th resource pool and DEFi is a definition of the i-th resource pool. You can define resource pools using one of the following formats: [, , ..., ] where VALUE is a string. This defines a an indexed pool with the given values. If you need to enter a string resource that contains special characters ( [ , ] , , , whitespace), you can wrap the value in quotes: [\"foo [,]\", bar, \"my resource\"] . range(-) where START and END are non-negative integers. This defines an indexed pool with numbers in the inclusive range [START, END] . [[, ..., ], [, ..., ], ...] where VALUE is a string. This defines an indexed pool where indices are grouped. x Creates indexed pool with N groups of size M, indices are indexed from 0, (e.g. \"2x3\" is equivalent to [[0, 1, 2], [3, 4, 5] ) sum() where SIZE is a positive integer. This defines a sum pool with the given size. Tip You might encounter a problem in your shell when you try to specify worker resources, because the definition contains parentheses ( () ). In that case just wrap the resource definition in quotes, like this: $ hq worker start --resource \"foo=sum(5)\" Resource names # Resource names are restricted by the following rules: They can only contain ASCII letters and digits ( a-z , A-Z , 0-9 ) and the slash ( / ) symbol. They need to begin with an ASCII letter. These restrictions exist because the resource names are passed as environment variable names to tasks, which often execute shell scripts. However, shells typically do not support environment variables containing anything else than ASCII letters, digits and the underscore symbol. Therefore, HQ limits resource naming to align with the behaviour of the shell. Important HQ will normalize the resource name when passing environment variables to a task (see below ). Automatically detected resources # The following resources are detected automatically if a resource of a given name is not explicitly defined. CPUs are automatically detected as resource named \"cpus\" (more in CPU resources ). GPUs that are available when a worker is started are automatically detected under the following resource names: NVIDIA GPUs are stored the under resource name gpus/nvidia . 
These GPUs are detected from the environment variable CUDA_VISIBLE_DEVICES or from the procfs filesystem. AMD GPUs are stored under the resource name gpus/amd . These GPUs are detected from the environment variable ROCR_VISIBLE_DEVICES . You can set these environment variables when starting a worker to override the list of available GPUs: $ CUDA_VISIBLE_DEVICES = 2 ,3 hq worker start # The worker will have resource gpus/nvidia=[2,3] RAM of the node is detected as the resource \"mem\" in MiB; i.e. --resource mem=100 asks for 100 MiB of memory. If you want to see how your system is seen by a worker without actually starting it, you can run: $ hq worker hwdetect The automatic detection of resources can be disabled with the argument --no-detect-resources in hq worker start ... . It disables detection of resources other than \"cpus\"; if the resource \"cpus\" is not explicitly defined, it will always be detected. Resource request # When you submit a job, you can define resource requests with the --resource flag: $ hq submit --resource = --resource = ... Where NAME is the name of the requested resource and AMOUNT is a positive number defining the size of the request. Tasks with such resource requests will only be executed on workers that fulfill all the specified task requests. Important Notice that task resource requests always ask for an amount of resources required by a task, regardless of whether that resource corresponds to an indexed or a sum pool on workers. For example, let's say that a worker has an indexed pool of GPUs: $ hq worker start --resource \"gpus/nvidia=range(1-3)\" And we create two jobs, each with a single task. The first job wants 1 GPU, the second one wants two GPUs. $ hq submit --resource gpus/nvidia = 1 ... $ hq submit --resource gpus/nvidia = 2 ... Then the first job can be allocated e.g. the GPU 2 and the second job can be allocated the GPUs 1 and 3 . Requesting all resources # A task may ask for all resources of a given type by specifying --resource =all . Such a task will be scheduled only on a worker that has at least 1 of such a resource, and when the task is executed, all resources of that type will be given to it. Resource request strategies # When a resource request is defined, you can specify an allocation strategy after the amount: --resource =\" \" . Specifying a strategy has an effect only if the worker provides an indexed resource with groups. If the resource is of another type, the strategy is ignored. When no strategy is defined, compact is used as the default. Compact ( compact ) - Tries to allocate indices in as few groups as possible in the current worker state. Example: $ hq submit --resource cpus = \"8 compact\" ... Strict Compact ( compact! ) - Always allocates indices in as few groups as possible for a target node. The task is not executed until the requirement can be fully fulfilled. E.g. if a worker has 4 indices per group and you ask for 4 indices in the strict compact mode, the task will always be executed with indices from a single group. If you ask for 8 cpus in the same way, it will always be executed with indices from two groups. Example: $ hq submit --resource cpus = \"8 compact!\" ... Scatter ( scatter ) - Allocates indices across as many groups as possible in the current worker state. E.g. let us assume that a worker has 4 groups with 8 indices per group, and you ask for 8 cpus in the scatter mode. If possible in the current situation, HQ tries to run the process with 2 cpus on each socket. Example: $ hq submit --resource cpus = \"8 scatter\" ...
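To tie the group syntax and the allocation strategies together, here is a small sketch; the resource name nic, the group layout and the submitted scripts are illustrative, while the flags themselves follow the formats described above:

```bash
#!/bin/bash
# Start a worker providing an indexed resource "nic" with 2 groups of 4 indices
# (equivalent to [[0,1,2,3],[4,5,6,7]]). The resource name is made up for this example.
hq worker start --resource 'nic=2x4' &

# Ask for 4 indices that must all come from a single group.
hq submit --resource 'nic=4 compact!' ./run-on-one-group.sh

# Ask for 4 indices spread across as many groups as possible.
hq submit --resource 'nic=4 scatter' ./spread-across-groups.sh
```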
Non-integer allocation of resources # The amount of a resource may be a non-integer number. E.g. you may ask for 0.5 of a resource. It tells the scheduler that you want to utilize only half of the resource, and if another process asks for at most 0.5 of the resource, it may get the same resource. This resource sharing is done on the logical level of HyperQueue; actual resource sharing is up to the tasks. The precision for defining the amount is four decimal places. Therefore, the minimal resource amount that you can ask for is 0.0001 . For sum resources, the amount is simply removed from the pool as in the case of integer resources. In the case of an indexed resource, the partial resource is always taken from a single index. It means that if there is an indexed resource with two indices that are both utilized at 0.75, then a task that asks for 0.5 of this resource will not be started, even though 0.5 of the resource is available in total, because there is no single index that is free at least at 0.5. If a non-integer amount is bigger than 1, then the integer part is always satisfied by whole indices and the rest comes from a part of another index. E.g. when you ask for 2.5 of an indexed resource, you will get 2 complete indices and one index allocated at 50%. Note In the current version, policy \"compact!\" is not allowed with non-integer amounts. Resource environment variables # When a task that has resource requests is executed, the following variables are passed to it for each resource request named : HQ_RESOURCE_REQUEST_ contains the amount of requested resources. HQ_RESOURCE_VALUES_ contains the specific resource values allocated for the task as a comma-separated list. This variable is only filled for an indexed resource pool. In case of a non-integer amount, the partially allocated index is always the last index. The slash symbol ( / ) in a resource name is normalized to an underscore ( _ ) when used in the environment variable name. HQ also sets additional environment variables for various resources with special names: For the resource gpus/nvidia , HQ will set: CUDA_VISIBLE_DEVICES to the same value as HQ_RESOURCE_VALUES_gpus_nvidia CUDA_DEVICE_ORDER to PCI_BUS_ID For the resource gpus/amd , HQ will set: ROCR_VISIBLE_DEVICES to the same value as HQ_RESOURCE_VALUES_gpus_amd Resource requests and job arrays # Resource requests are applied to each task of a job. For example, if you submit the following: $ hq submit --cpus = 2 --array = 1 -10 then each task will require two cores. Resource variants # A task may have multiple resource requests attached. There is no command line interface for this feature, but it can be configured through a Job Definition File .","title":"Resources"},{"location":"jobs/resources/#resource-management","text":"Resource management serves for defining arbitrary resources provided by workers and the corresponding resource requests required by tasks. HyperQueue will take care of matching task resource requests so that only workers that can fulfill them will be able to execute such tasks. Some generic resources are automatically detected ; however, users may also define their own resources. From version 0.13.0, CPUs are also managed like other resources, but they still have some extra functionality; therefore, there is a special section about CPU resources . Important Resources in HyperQueue exist on a purely logical level. They can correspond to physical things (like GPUs), but it is the responsibility of the user to make sure that this correspondence makes sense.
With exception of CPUs, HyperQueue by itself does not attach any semantics to resources, they are just numbers used for scheduling.","title":"Resource management"},{"location":"jobs/resources/#worker-resources","text":"Each worker has one or mores resources attached. Each resource is a resource pool identified by a name. A resource pool represents some resources provided by a worker; each task can then ask for a part of the resources contained in that pool. There are two kinds of resource pools: Indexed pool : This pool represents an enumerated set of resources represented by strings. Each resource has its own identity. Tasks do not ask for specific values from the set, they just specify how many resources they require and HyperQueue will allocate the specified amount of resources from the pool for each task. This pool is useful for resources that have their own identity, for example individual GPU or FPGA accelerators. HyperQueue guarantees that no individual resource from the indexed pool is allocated to more than a single task at any given time and that a task will not be executed on a worker if it does not currently have enough individual resources to fulfill the resource request of the task. Indexed pool can be defined with groups where indices live in separated groups. Task may then ask for different allocation policies (e.g. use resources from the same or different groups). The main purpose of this is to capture NUMA architectures, each group then represents a socket with cores. Sum pool : This pool represents a resource that has a certain size which is split into individual tasks. A typical example is memory; if a worker has 2000 bytes of memory, it can serve e.g. four tasks, if each task asks for 500 bytes of memory. HyperQueue guarantees that the sum of resource request sizes of running tasks on a worker does not exceed the total size of the sum pool.","title":"Worker resources"},{"location":"jobs/resources/#specifying-worker-resources","text":"You can specify the resource pools of a worker when you start it: $ hq worker start --resource \"=\" --resource \"=\" ... where NAMEi is a name (string ) of the i -th resource pool and DEFi is a definition of the i-th resource pool. You can define resource pools using one of the following formats: [, , ..., ] where VALUE is a string. This defines a an indexed pool with the given values. If you need to enter a string resource that contains special characters ( [ , ] , , , whitespace), you can wrap the value in quotes: [\"foo [,]\", bar, \"my resource\"] . range(-) where START and END are non-negative integers. This defines an indexed pool with numbers in the inclusive range [START, END] . [[, ..., ], [, ..., ], ...] where VALUE is a string. This defines an indexed pool where indices are grouped. x Creates indexed pool with N groups of size M, indices are indexed from 0, (e.g. \"2x3\" is equivalent to [[0, 1, 2], [3, 4, 5] ) sum() where SIZE is a positive integer. This defines a sum pool with the given size. Tip You might encounter a problem in your shell when you try to specify worker resources, because the definition contains parentheses ( () ). In that case just wrap the resource definition in quotes, like this: $ hq worker start --resource \"foo=sum(5)\"","title":"Specifying worker resources"},{"location":"jobs/resources/#resource-names","text":"Resource names are restricted by the following rules: They can only contain ASCII letters and digits ( a-z , A-Z , 0-9 ) and the slash ( / ) symbol. They need to begin with an ASCII letter. 
These restrictions exist because the resource names are passed as environment variable names to tasks, which often execute shell scripts. However, shells typically do not support environment variables containing anything else than ASCII letters, digits and the underscore symbol. Therefore, HQ limits resource naming to align with the behaviour of the shell. Important HQ will normalize the resource name when passing environment variables to a task (see below ).","title":"Resource names"},{"location":"jobs/resources/#automatically-detected-resources","text":"The following resources are detected automatically if a resource of a given name is not explicitly defined. CPUs are automatically detected as resource named \"cpus\" (more in CPU resources ). GPUs that are available when a worker is started are automatically detected under the following resource names: NVIDIA GPUs are stored the under resource name gpus/nvidia . These GPUs are detected from the environment variable CUDA_VISIBLE_DEVICES or from the procfs filesystem. AMD GPUs are stored under the resource name gpus/amd . These GPUs are detected from the environment variable ROCR_VISIBLE_DEVICES . You can set these environment variables when starting a worker to override the list of available GPUs: $ CUDA_VISIBLE_DEVICES = 2 ,3 hq worker start # The worker will have resource gpus/nvidia=[2,3] RAM of the node is detected as resource \"mem\" in megabytes; i.e. --resource mem=100 asks for 100 MiBs of the memory. If you want to see how is your system seen by a worker without actually starting it, you can start: $ hq worker hwdetect The automatic detection of resources can be disabled by argument --no-detect-resources in hq worker start ... . It disables detection of resources other than \"cpus\"; if resource \"cpus\" are not explicitly defined, it will always be detected.","title":"Automatically detected resources"},{"location":"jobs/resources/#resource-request","text":"When you submit a job, you can define a resource requests with the --resource flag: $ hq submit --resource = --resource = ... Where NAME is a name of the requested resource and the AMOUNT is a positive number defining the size of the request. Tasks with such resource requests will only be executed on workers that fulfill all the specified task requests. Important Notice that task resource requests always ask for an amount of resources required by a task, regardless whether that resource corresponds to an indexed or a sum pool on workers. For example, let's say that a worker has an indexed pool of GPUs: $ hq worker start --resource \"gpus/nvidia=range(1-3)\" And we create two jobs, each with a single task. The first job wants 1 GPU, the second one wants two GPUs. $ hq submit --resource gpus/nvidia = 1 ... $ hq submit --resource gpus/nvidia = 2 ... Then the first job can be allocated e.g. the GPU 2 and the second job can be allocated the GPUs 1 and 3 .","title":"Resource request"},{"location":"jobs/resources/#requesting-all-resources","text":"A task may ask for all given resources of that type by specifying --resource =all . Such a task will be scheduled only on a worker that has at least 1 of such resource and when a task is executed all resources of that type will be given to this task.","title":"Requesting all resources"},{"location":"jobs/resources/#resource-request-strategies","text":"When resource request is defined, after the amount you can define allocation strategy: --resource =\" \" . Specifying strategy has effect only if worker provides indexed resource in groups. 
If resource is other type, then strategy is ignored. When strategy is not defined then compact is used as default. Compact ( compact ) - Tries to allocate indices in few groups as possible in the current worker state. Example: $ hq submit --resource cpus = \"8 compact\" ... Strict Compact ( compact! ) - Always allocate indices on as few groups as possible for a target node. The task is not executed until the requirement could not be fully fulfilled. E.g. If a worker has 4 indices per a group and you ask for 4 indices in the strict compact mode, it will always be executed with indices from a single group. If you ask for 8 cpus in the same way, it will always be executed with indices from two groups. Example: $ hq submit --resource cpus = \"8 compact!\" ... ` Scatter ( scatter ) - Allocate indices across as many groups as possible in the current worker state. E.g. Let us assume that a worker has 4 groups with 8 indices per group, and you ask for 8 cpus in the scatter mode. If possible in the current situation, HQ tries to run process with 2 cpus on each socket. Example: $ hq submit --resource cpus = \"8 scatter\" ...","title":"Resource request strategies"},{"location":"jobs/resources/#non-integer-allocation-of-resources","text":"Amount of the resource may be a non-integer number. E.g. you may ask for 0.5 of a resource. It tells the scheduler that you want to utilize only half of the resource and if another process asks for at most 0.5 of the resource, it may get the same resource. This resource sharing is done on logical of HyperQueue and actual resource sharing is up to tasks. The precision for defining amount is four decimal places. Therefore, the minimal resource amount that you can ask for is 0.0001 . For sum resources, the amount is simply removed from the pool as in the case of integer resources. In the case of an indexed resource, the partial resource is always taken from a single index. It means that if there is an indexed resource with two indices that are both utilized on 0.75, then a task that ask for 0.5 of this resource will not be started, despite there is available 0.5 of the resource in total, because there is no single index that is free at least on 0.5. If non-integer is bigger than 1, than integer part is always satisfied as whole indices and rest is a part of another index. E.g. when you ask for 2.5 of an indexed resource, you will get 2 complete indices and one index allocated on 50%. Note In the current version, policy \"compact!\" is not allowed with non-integer amounts.","title":"Non-integer allocation of resources"},{"location":"jobs/resources/#resource-environment-variables","text":"When a task that has resource requests is executed, the following variables are passed to it for each resource request named : HQ_RESOURCE_REQUEST_ contains the amount of requested resources. HQ_RESOURCE_VALUES_ contains the specific resource values allocated for the task as a comma-separated list. This variable is only filled for an indexed resource pool. In case of non-integer amount, the partially allocated index is always the last index. The slash symbol ( / ) in resource name is normalized to underscore ( _ ) when being used in the environment variable name. 
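For illustration, a task script might read these variables as follows; a minimal sketch that assumes the task was submitted with a gpus/nvidia resource request (e.g. --resource gpus/nvidia=2 ):

```bash
#!/bin/bash
# Example task script: print how much of the gpus/nvidia resource was requested
# and which concrete indices were allocated (note the normalized variable names).
echo "requested amount: $HQ_RESOURCE_REQUEST_gpus_nvidia"
echo "allocated values: $HQ_RESOURCE_VALUES_gpus_nvidia"
```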
HQ also sets additional environment variables for various resources with special names: For the resource gpus/nvidia , HQ will set: CUDA_VISIBLE_DEVICES to the same value as HQ_RESOURCE_VALUES_gpus_nvidia CUDA_DEVICE_ORDER to PCI_BUS_ID For the resource gpus/amd , HQ will set: ROCR_VISIBLE_DEVICES to the same value as HQ_RESOURCE_VALUES_gpus_amd","title":"Resource environment variables"},{"location":"jobs/resources/#resource-requests-and-job-arrays","text":"Resource requests are applied to each task of job. For example, if you submit the following: $ hq submit --cpus = 2 --array = 1 -10 then each task will require two cores.","title":"Resource requests and job arrays"},{"location":"jobs/resources/#resource-variants","text":"A task may have attached more resource requests. There is no command line interface for this feature, but it can be configured through a Job Definition File .","title":"Resource variants"},{"location":"jobs/streaming/","text":"Jobs containing many tasks will generate a large amount of stdout and stderr files, which can be problematic, especially on network-based shared filesystems, such as Lustre. For example, when you submit the following task array: $ hq submit --array = 1 -10000 my-computation.sh 20000 files ( 10000 for stdout and 10000 for stderr) will be created on the disk. To avoid this situation, HyperQueue can optionally stream the stdout and stderr output of tasks into a compact format that do not create a file per task. Note In this section, we refer to stdout and stderr as channels . Redirecting output to the stream # You can redirect the output of stdout and stderr to a log file and thus enable output streaming by passing a path to a filename where the log will be stored with the --stream option: $ hq submit --stream= --array=1-10_000 ... Output log path has to be a directory and it the user responsibility to ensure existence of the directory and visibility of each worker. This command would cause the stdout and stderr of all 10_000 tasks to be streamed into the server, which will write them to files in . The streamed data is written in a compact way independently on the number of tasks. The format also contains additional metadata, which allows the resulting file to be filtered/sorted by tasks or channel. Tip You can use selected placeholders inside the stream path. Partial redirection # By default, both stdout and stderr will be streamed if you specify --stream and do not specify an explicit path for stdout and stderr . To stream only one of the channels, you can use the --stdout / --stderr options to redirect one of them to a file or to disable it completely. For example: # Redirecting stdout into a file, streaming stderr into `my-log` $ hq submit --stream = my-log --stdout = \"stdout-%{TASK_ID}\" ... # Streaming stdout into `my-log`, disabling stderr $ hq submit --stream = my-log --stderr = none ... Guarantees # HyperQueue provides the following guarantees regarding output streaming: When a task is Finished or Failed it is guaranteed that all data produced by the task is flushed into the streaming file. With the following two exceptions: If the streaming itself fails (e.g. because there was insufficient disk space for the stream file), then the task will fail with an error prefixed with \"Streamer:\" and no streaming guarantees will be upheld. When a task is Canceled or task fails because of time limit is reached, then the part of its stream that was buffered in the worker is dropped to avoid spending additional resources for this task. 
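As a quick, end-to-end sketch of how streaming is typically used (the stream directory my-log and the array range are illustrative; the directory must exist and be visible to every worker):

```bash
#!/bin/bash
# Create the stream directory on a filesystem visible to the workers.
mkdir -p my-log

# Stream stdout/stderr of all tasks into the directory instead of per-task files.
hq submit --stream=my-log --array=1-1000 ./my-computation.sh

# After the tasks complete, inspect what was streamed.
hq output-log my-log summary
hq output-log my-log show
```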
Inspecting the stream files # HyperQueue lets you inspect the data stored inside the stream file using various subcommands. All these commands have the following structure: $ hq output-log Stream summary # You can display a summary of a log file using the summary subcommand: $ hq output-log summary Stream jobs # To print all job IDs that streaming in the stream path, you can run the following command: $ hq output-log jobs Printing stream content # If you want to simply print the (textual) content of the log file, without any associating metadata, you can use the cat subcommand: $ hq output-log cat It will print the raw content of either stdout or stderr , ordered by task id. All outputs will be concatenated one after another. You can use this to process the streamed data e.g. by a postprocessing script. By default, this command will fail if there is an unfinished stream (i.e. when some task is still running and streaming data into the log). If you want to use cat even when the log is not finished yet, use the --allow-unfinished option. If you want to see the output of a specific task, you can use the --task= option. Stream metadata # If you want to inspect the contents of the log, along with its inner metadata that shows which task and which channel has produced which part of the data, you can use the show subcommand: $ hq output-log show The output will have the form J.T:C> DATA where J is a job id, T is a task id and C is 0 for stdout channel and 1 for stderr channel. You can filter a specific channel with the --channel=stdout/stderr flag. Exporting log # Log can be exported into JSON by the following command: $ hq output-log export This prints the log file into a JSON format on standard output. Superseded streams # When a worker crashes while executing a task, the task will be restarted . HyperQueue gives each run of task a difference INSTANCE_ID, and it is a part of stream metadata, hence HyperQueue streaming is able to avoid mixing outputs from different executions of the same task, when a task is restarted. HyperQueue automatically marks all output from previous instance of a task except the last instance as superseded . You can see statistics about superseded data via hq output-log summary command. In the current version, superseded data is ignored by all other commands. More server instances # HyperQueue supports writing streams from the different server instances into the same directory. If you run hq output-log commands over such directory then it will detect the situation and prints all server uids that writes into the directory. You have to specify the server instance via hq output-log --server-uid= ... when working with such a output log directory. Note When a server is restored from a journal file, it will maintain the same server UID. When a server is started \"from a scratch\" a new server uid is generated. Working with non-shared file system # You do not need to have a shared file system when working with streaming. It is just your responsibility to collect all generated files into one directory before using hq output-log commands.","title":"Output Streaming"},{"location":"jobs/streaming/#redirecting-output-to-the-stream","text":"You can redirect the output of stdout and stderr to a log file and thus enable output streaming by passing a path to a filename where the log will be stored with the --stream option: $ hq submit --stream= --array=1-10_000 ... 
The output log path has to be a directory, and it is the user's responsibility to ensure that the directory exists and is visible to each worker. This command would cause the stdout and stderr of all 10_000 tasks to be streamed into the server, which will write them to files in . The streamed data is written in a compact way, independently of the number of tasks. The format also contains additional metadata, which allows the resulting file to be filtered/sorted by task or channel. Tip You can use selected placeholders inside the stream path.","title":"Redirecting output to the stream"},{"location":"jobs/streaming/#partial-redirection","text":"By default, both stdout and stderr will be streamed if you specify --stream and do not specify an explicit path for stdout and stderr . To stream only one of the channels, you can use the --stdout / --stderr options to redirect one of them to a file or to disable it completely. For example: # Redirecting stdout into a file, streaming stderr into `my-log` $ hq submit --stream = my-log --stdout = \"stdout-%{TASK_ID}\" ... # Streaming stdout into `my-log`, disabling stderr $ hq submit --stream = my-log --stderr = none ...","title":"Partial redirection"},{"location":"jobs/streaming/#guarantees","text":"HyperQueue provides the following guarantees regarding output streaming: When a task is Finished or Failed , it is guaranteed that all data produced by the task has been flushed into the streaming file, with the following two exceptions: If the streaming itself fails (e.g. because there was insufficient disk space for the stream file), then the task will fail with an error prefixed with \"Streamer:\" and no streaming guarantees will be upheld. When a task is Canceled or fails because its time limit is reached, the part of its stream that was still buffered in the worker is dropped to avoid spending additional resources on this task.","title":"Guarantees"},{"location":"jobs/streaming/#inspecting-the-stream-files","text":"HyperQueue lets you inspect the data stored inside the stream file using various subcommands. All these commands have the following structure: $ hq output-log ","title":"Inspecting the stream files"},{"location":"jobs/streaming/#stream-summary","text":"You can display a summary of a log file using the summary subcommand: $ hq output-log summary","title":"Stream summary"},{"location":"jobs/streaming/#stream-jobs","text":"To print all job IDs that stream into the stream path, you can run the following command: $ hq output-log jobs","title":"Stream jobs"},{"location":"jobs/streaming/#printing-stream-content","text":"If you want to simply print the (textual) content of the log file, without any associated metadata, you can use the cat subcommand: $ hq output-log cat It will print the raw content of either stdout or stderr , ordered by task id. All outputs will be concatenated one after another. You can use this to process the streamed data, e.g. with a postprocessing script. By default, this command will fail if there is an unfinished stream (i.e. when some task is still running and streaming data into the log). If you want to use cat even when the log is not finished yet, use the --allow-unfinished option.
If you want to see the output of a specific task, you can use the --task= option.","title":"Printing stream content"},{"location":"jobs/streaming/#stream-metadata","text":"If you want to inspect the contents of the log, along with its inner metadata that shows which task and which channel produced which part of the data, you can use the show subcommand: $ hq output-log show The output will have the form J.T:C> DATA where J is a job id, T is a task id, and C is 0 for the stdout channel and 1 for the stderr channel. You can filter a specific channel with the --channel=stdout/stderr flag.","title":"Stream metadata"},{"location":"jobs/streaming/#exporting-log","text":"The log can be exported into JSON with the following command: $ hq output-log export This prints the log file in JSON format to standard output.","title":"Exporting log"},{"location":"jobs/streaming/#superseded-streams","text":"When a worker crashes while executing a task, the task will be restarted . HyperQueue gives each run of a task a different INSTANCE_ID, which is part of the stream metadata; hence, HyperQueue streaming is able to avoid mixing outputs from different executions of the same task when a task is restarted. HyperQueue automatically marks all output from previous instances of a task (all instances except the last one) as superseded . You can see statistics about superseded data via the hq output-log summary command. In the current version, superseded data is ignored by all other commands.","title":"Superseded streams"},{"location":"jobs/streaming/#more-server-instances","text":"HyperQueue supports writing streams from different server instances into the same directory. If you run hq output-log commands over such a directory, it will detect the situation and print all server UIDs that write into the directory. You have to specify the server instance via hq output-log --server-uid= ... when working with such an output log directory. Note When a server is restored from a journal file, it will maintain the same server UID. When a server is started \"from scratch\", a new server UID is generated.","title":"More server instances"},{"location":"jobs/streaming/#working-with-non-shared-file-system","text":"You do not need to have a shared file system when working with streaming. It is your responsibility to collect all generated files into one directory before using hq output-log commands.","title":"Working with non-shared file system"},{"location":"python/","text":"Python API # The Python API provides greater flexibility and supports use-cases that are difficult to express using the CLI, such as dynamically submitting tasks when some part of the computation is finished. It covers the full task definition, including all options available through a Job Definition File (dependencies between tasks, resource variants, etc.). You can find the HyperQueue Python API reference here . Requirements # To use the Python API, you will need at least Python 3.6 and some dependencies that will be installed automatically using pip. Installation # You can install the HyperQueue Python API from PyPI with the following command: $ python3 -m pip install hyperqueue The Python package contains a pre-compiled version of HyperQueue, so you do not have to download hq manually if you just want to use the Python API. Warning The Python API is currently distributed only for the x86-64 architecture. If you need a build for another architecture, please contact us on GitHub.
You can also build the Python package manually from our GitHub repository, but you will need to install a Rust toolchain for that. Quick start # Here is a minimal code example that spawns a local HyperQueue cluster and uses it to submit a simple job: from hyperqueue import Job , LocalCluster # Spawn a HQ server with LocalCluster () as cluster : # Add a single HyperQueue worker to the server cluster . start_worker () # Create a client and a job client = cluster . client () job = Job () # Add a task that executes `ls` to the job job . program ([ \"ls\" ]) # Submit the job submitted = client . submit ( job ) # Wait until the job completes client . wait_for_jobs ([ submitted ])","title":"Getting started"},{"location":"python/#python-api","text":"The Python API provides greater flexibility and supports use-cases that are difficult to express using the CLI, such as dynamically submitting tasks when some part of the computation is finished. It covers the full task definition, including all options available through a Job Definition File (dependencies between tasks, resource variants, etc.). You can find the HyperQueue Python API reference here .","title":"Python API"},{"location":"python/#requirements","text":"To use the Python API, you will need at least Python 3.6 and some dependencies that will be installed automatically using pip.","title":"Requirements"},{"location":"python/#installation","text":"You can install the HyperQueue Python API from PyPI with the following command: $ python3 -m pip install hyperqueue The Python package contains a pre-compiled version of HyperQueue, so you do not have to download hq manually if you just want to use the Python API. Warning The Python API is currently distributed only for the x86-64 architecture. If you need a build for another architecture, please contact us on GitHub. You can also build the Python package manually from our GitHub repository, but you will need to install a Rust toolchain for that.","title":"Installation"},{"location":"python/#quick-start","text":"Here is a minimal code example that spawns a local HyperQueue cluster and uses it to submit a simple job: from hyperqueue import Job , LocalCluster # Spawn a HQ server with LocalCluster () as cluster : # Add a single HyperQueue worker to the server cluster . start_worker () # Create a client and a job client = cluster . client () job = Job () # Add a task that executes `ls` to the job job . program ([ \"ls\" ]) # Submit the job submitted = client . submit ( job ) # Wait until the job completes client . wait_for_jobs ([ submitted ])","title":"Quick start"},{"location":"python/client/","text":"Client # To submit jobs using the Python API, you first need to create a Client that connects to a running HyperQueue cluster. You have two options for deploying the cluster. Once you have an instance of a Client , you can use it to submit a job. Using external deployment # If you want to run the HyperQueue infrastructure on a distributed cluster or you want to use automatic allocation , then deploy HyperQueue in any of the supported ways and then pass the server directory to the Client : from hyperqueue import Client client = Client ( \"/home/user/.hq-server/hq-current\" ) If you have used the default server directory and the server is deployed on a file-system shared by the node that executes the Python code, you can simply create an instance of a Client without passing any parameters. Using a local cluster # You can use the LocalCluster class to spawn a HyperQueue server and a set of workers directly on your local machine.
This functionality is primarily intended for local prototyping and debugging, but it can also be used for actual computations for simple use-cases that do not require a distributed deployment of HyperQueue. When you create the cluster, it will initially only start the HyperQueue server. To connect workers to it, use the start_worker method. from hyperqueue import LocalCluster from hyperqueue.cluster import WorkerConfig with LocalCluster () as cluster : # Add a worker with 4 cores to the cluster cluster . start_worker ( WorkerConfig ( cores = 4 )) # Create a client connected to the cluster client = cluster . client () Tip You can use LocalCluster instances as context managers to make sure that the cluster is properly cleaned up at the end of the with block.","title":"Client"},{"location":"python/client/#client","text":"To submit jobs using the Python API, you first need to create a Client that connects to a running HyperQueue cluster. You have two options of deploying the cluster. Once you have an instance of a Client , you can use it to submit a job.","title":"Client"},{"location":"python/client/#using-external-deployment","text":"If you want to run the HyperQueue infrastructure on a distributed cluster or you want to use automatic allocation , then deploy HyperQueue in any of the supported ways and then pass the server directory to the Client : from hyperqueue import Client client = Client ( \"/home/user/.hq-server/hq-current\" ) If you have used the default server directory and the server is deployed on a file-system shared by the node that executes the Python code, you can simply create an instance of a Client without passing any parameters.","title":"Using external deployment"},{"location":"python/client/#using-a-local-cluster","text":"You can use the LocalCluster class to spawn a HyperQueue server and a set of workers directly on your local machine. This functionality is primarily intended for local prototyping and debugging, but it can also be used for actual computations for simple use-cases that do not require a distributed deployment of HyperQueue. When you create the cluster, it will initially only start the HyperQueue server. To connect workers to it, use the start_worker method. from hyperqueue import LocalCluster from hyperqueue.cluster import WorkerConfig with LocalCluster () as cluster : # Add a worker with 4 cores to the cluster cluster . start_worker ( WorkerConfig ( cores = 4 )) # Create a client connected to the cluster client = cluster . client () Tip You can use LocalCluster instances as context managers to make sure that the cluster is properly cleaned up at the end of the with block.","title":"Using a local cluster"},{"location":"python/dependencies/","text":"Task dependencies # One of the most useful features of the HyperQueue Python API is that it allows you to define dependencies between individual tasks of a job. If a task B depends on task A , then B will not be executed until A has (successfully) finished. Using dependencies, you can describe arbitrarily complex DAG (directed acyclic graph) workflows. Notice HyperQueue jobs are independent of each other, so dependencies can only be specified between tasks within a single job. Defining dependencies # To define a dependency between tasks, you will first need to store the Task instances that you get when you create a task . 
You can then use the deps parameter when creating a new task and pass an existing task instance to define a dependency: from hyperqueue import Job job = Job () # Create a first task that generates data task_a = job . program ([ \"generate-data\" , \"--file\" , \"out.txt\" ]) # Create a dependent task that consumes the data job . program ([ \"consume-data\" , \"--file\" , \"out.txt\" ], deps = [ task_a ]) The second task will not be started until the first one successfully finishes. You can also depend on multiple tasks at once: # Create several tasks that generate data tasks = [ job . program ([ \"generate-data\" , \"--file\" , f \"out- { i } .txt\" ]) for i in range ( 5 )] # Create a dependent task that consumes the data job . program ([ \"consume-data\" , \"--file\" , \"out- %d .txt\" ], deps = [ tasks ]) Dependencies are transitive, so you can build an arbitrary graph: task_a = job . program ([ \"generate\" , \"1\" ]) task_b = job . program ([ \"generate\" , \"2\" ]) task_c = job . program ([ \"compute\" ], deps = [ task_a , task_b ]) task_d = job . program ([ \"postprocess\" ], deps = [ task_c ]) In this case, task D will not start until all the three previous tasks are successfully finished.","title":"Dependencies"},{"location":"python/dependencies/#task-dependencies","text":"One of the most useful features of the HyperQueue Python API is that it allows you to define dependencies between individual tasks of a job. If a task B depends on task A , then B will not be executed until A has (successfully) finished. Using dependencies, you can describe arbitrarily complex DAG (directed acyclic graph) workflows. Notice HyperQueue jobs are independent of each other, so dependencies can only be specified between tasks within a single job.","title":"Task dependencies"},{"location":"python/dependencies/#defining-dependencies","text":"To define a dependency between tasks, you will first need to store the Task instances that you get when you create a task . You can then use the deps parameter when creating a new task and pass an existing task instance to define a dependency: from hyperqueue import Job job = Job () # Create a first task that generates data task_a = job . program ([ \"generate-data\" , \"--file\" , \"out.txt\" ]) # Create a dependent task that consumes the data job . program ([ \"consume-data\" , \"--file\" , \"out.txt\" ], deps = [ task_a ]) The second task will not be started until the first one successfully finishes. You can also depend on multiple tasks at once: # Create several tasks that generate data tasks = [ job . program ([ \"generate-data\" , \"--file\" , f \"out- { i } .txt\" ]) for i in range ( 5 )] # Create a dependent task that consumes the data job . program ([ \"consume-data\" , \"--file\" , \"out- %d .txt\" ], deps = [ tasks ]) Dependencies are transitive, so you can build an arbitrary graph: task_a = job . program ([ \"generate\" , \"1\" ]) task_b = job . program ([ \"generate\" , \"2\" ]) task_c = job . program ([ \"compute\" ], deps = [ task_a , task_b ]) task_d = job . program ([ \"postprocess\" ], deps = [ task_c ]) In this case, task D will not start until all the three previous tasks are successfully finished.","title":"Defining dependencies"},{"location":"python/submit/","text":"Submitting jobs # You can use the Python API to submit jobs (directed acyclic graphs of tasks) through a Client . 
In addition to the functionality offered by the HyperQueue CLI, you can use the Python API to add dependencies between tasks, configure each task individually and create tasks out of Python functions . Job # To build a job, you first have to create an instance of the Job class. from hyperqueue import Job job = Job () Tasks # Once you have created a job, you can add tasks to it. Currently, each task can represent either the execution of an external program or the execution of a Python function . To create complex workflows, you can also specify dependencies between tasks. External programs # To create a task that will execute an external program, you can use the program method of a Job : job . program ([ \"/bin/my-program\" , \"foo\" , \"bar\" , \"--arg\" , \"42\" ]) You can pass program arguments and various other parameters to the task. The program method will return a Task object that represents the created task. This object can be used further, e.g. for defining dependencies . Python functions # If you want to execute a Python function as a task, you can use the function method of a Job : def preprocess_data ( fast , path ): with open ( path ) as f : data = f . read () if fast : preprocess_fast ( data ) else : preprocess ( data ) job . function ( preprocess_data , args = ( True , \"/data/a.txt\" )) job . function ( preprocess_data , args = ( False , \"/data/b.txt\" )) You can pass both positional and keyword arguments to the function. The arguments will be serialized using cloudpickle . Python tasks can be useful for performing e.g. various data preprocessing and organization tasks. You can co-locate the logic of Python tasks together with the code that defines the submitted workflow (job), without the need to write an additional external script. Same as with the program method, function will return a Task that can be used to define dependencies . Notice Currently, a new Python interpreter will be started for each Python task. Python environment # When you use a Python function as a task, the task will attempt to import the hyperqueue package when it executes (to perform some bookkeeping in the background). This function will be executed on a worker - this means that it needs to have access to the correct Python version (and virtual environment) that contains the hyperqueue package! To make sure that the function will be executed in the correct Python environment, you can use PythonEnv and its prologue argument. It lets you specify a (shell) command that will be executed before the Python interpreter that executes your function is spawned. from hyperqueue.task.function import PythonEnv from hyperqueue import Client env = PythonEnv ( prologue = \"ml Python/XYZ && source //bin/activate\" ) client = Client ( python_env = env ) If you use Python functions as tasks, it is pretty much required to use PythonEnv , unless your workers are already spawned in an environment that has the correct Python loaded (e.g. using .bashrc or a similar mechanism). Parametrizing tasks # You can parametrize both external and Python tasks by setting their working directory, standard output paths, environment variables, or HyperQueue-specific parameters like resources or time limits . In contrast to the CLI, where you can only use a single set of parameters for all tasks of a job, with the Python API you can specify these parameters individually for each task. You can find more details in the documentation of the program or function methods.
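Putting the task types described above together, here is a small sketch that mixes a Python function task with an external program task and links them with a dependency via the deps parameter. The prepare_input function, /bin/my-solver and the file name are illustrative placeholders; the Job, function, program and deps API comes from the examples above. The resulting job would then be submitted as shown in the next section.

```python
from hyperqueue import Job

def prepare_input(path):
    # A small Python task that creates an input file for the external program.
    with open(path, "w") as f:
        f.write("42\n")

job = Job()

# A Python function task...
prep = job.function(prepare_input, args=("input.txt",))

# ...followed by an external program task that consumes its output.
# `/bin/my-solver` is a placeholder for your own executable.
job.program(["/bin/my-solver", "--input", "input.txt"], deps=[prep])
```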
Submitting a job # Once you have added some tasks to the job, you can submit it using the Client 's submit method: client = Client () submitted = client . submit ( job ) To wait until the job has finished executing, use the wait_for_jobs method: client . wait_for_jobs ([ submitted ])","title":"Submitting jobs"},{"location":"python/submit/#submitting-jobs","text":"You can use the Python API to submit jobs (directed acyclic graphs of tasks) through a Client . In addition to the functionality offered by the HyperQueue CLI, you can use the Python API to add dependencies between jobs, configure each task individually and create tasks out of Python functions .","title":"Submitting jobs"},{"location":"python/submit/#job","text":"To build a job, you first have to create an instance of the Job class. from hyperqueue import Job job = Job ()","title":"Job"},{"location":"python/submit/#tasks","text":"Once you have created a job, you can add tasks to it. Currently, each task can represent either the execution of an external program or the execution of a Python function . To create complex workflows, you can also specify dependencies between tasks.","title":"Tasks"},{"location":"python/submit/#external-programs","text":"To create a task that will execute an external program, you can use the program method of a Job : job . program ([ \"/bin/my-program\" , \"foo\" , \"bar\" , \"--arg\" , \"42\" ]) You can pass the program arguments or various other parameters to the task. The program method will return a Task object that represents the created task. This object can be used further e.g. for defining dependencies .","title":"External programs"},{"location":"python/submit/#python-functions","text":"If you want to execute a Python function as a task, you can use the function method of a Job : def preprocess_data ( fast , path ): with open ( path ) as f : data = f . read () if fast : preprocess_fast ( data ) else : preprocess ( data ) job . function ( preprocess_data , args = ( True , \"/data/a.txt\" )) job . function ( preprocess_data , args = ( False , \"/data/b.txt\" )) You can pass both positional and keyword arguments to the function. The arguments will be serialized using cloudpickle . Python tasks can be useful to perform e.g. various data preprocessing and organization tasks. You can co-locate the logic of Python tasks together with the code that defines the submitted workflow (job), without the need to write an additional external script. Same as with the program method, function will return a Task that can used to define dependencies . Notice Currently, a new Python interpreter will be started for each Python task.","title":"Python functions"},{"location":"python/submit/#python-environment","text":"When you use a Python function as a task, the task will attempt to import the hyperqueue package when it executes (to perform some bookkeeping on the background). This function will be executed on a worker - this means that it needs to have access to the correct Python version (and virtual environment) that contains the hyperqueue package! To make sure that the function will be executed in the correct Python environment, you can use PythonEnv and its prologue argument. It lets you specify a (shell) command that will be executed before the Python interpreter that executes your function is spawned. 
from hyperqueue.task.function import PythonEnv from hyperqueue import Client env = PythonEnv ( prologue = \"ml Python/XYZ && source //bin/activate\" ) client = Client ( python_env = env ) If you use Python functions as tasks, it is pretty much required to use PythonEnv , unless your workers are already spawned in an environment that has the correct Python loaded (e.g. using .bashrc or a similar mechanism).","title":"Python environment"},{"location":"python/submit/#parametrizing-tasks","text":"You can parametrize both external or Python tasks by setting their working directory, standard output paths, environment variables or HyperQueue specific parameters like resources or time limits . In contrast to the CLI, where you can only use a single set of parameters for all tasks of a job, with the Python API you can specify these parameters individually for each task. You can find more details in the documentation of the program or function methods.","title":"Parametrizing tasks"},{"location":"python/submit/#submitting-a-job","text":"Once you have added some tasks to the job, you can submit it using the Client 's submit method: client = Client () submitted = client . submit ( job ) To wait until the job has finished executing, use the wait_for_jobs method: client . wait_for_jobs ([ submitted ])","title":"Submitting a job"}]} \ No newline at end of file +{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"HyperQueue is a tool designed to simplify execution of large workflows (task graphs) on HPC clusters. It allows you to execute a large number of tasks in a simple way, without having to manually submit jobs into batch schedulers like Slurm or PBS. You just specify what you want to compute \u2013 HyperQueue will automatically ask for computational resources and dynamically load-balance tasks across all allocated nodes and cores. HyperQueue can also work without Slurm/PBS as a general task executor. If you use HyperQueue in your research, please consider citing it . Useful links # Installation Quick start Python API Repository Discussion forum Zulip (chat platform) Features # Resource management Batch jobs are submitted and managed automatically Computation is distributed amongst all allocated nodes and cores Tasks can specify resource requirements (# of cores, GPUs, memory, ...) Performance Scales to millions of tasks and hundreds of nodes Overhead per task is around 0.1 ms Task output can be streamed to a single file to avoid overloading distributed filesystems Simple deployment HQ is provided as a single, statically linked binary without any dependencies No admin access to a cluster is needed","title":"Overview"},{"location":"#useful-links","text":"Installation Quick start Python API Repository Discussion forum Zulip (chat platform)","title":"Useful links"},{"location":"#features","text":"Resource management Batch jobs are submitted and managed automatically Computation is distributed amongst all allocated nodes and cores Tasks can specify resource requirements (# of cores, GPUs, memory, ...) 
Performance Scales to millions of tasks and hundreds of nodes Overhead per task is around 0.1 ms Task output can be streamed to a single file to avoid overloading distributed filesystems Simple deployment HQ is provided as a single, statically linked binary without any dependencies No admin access to a cluster is needed","title":"Features"},{"location":"cheatsheet/","text":"Cheatsheet # Here you can find a cheatsheet with the most basic HQ commands.","title":"Cheatsheet"},{"location":"cheatsheet/#cheatsheet","text":"Here you can find a cheatsheet with the most basic HQ commands.","title":"Cheatsheet"},{"location":"faq/","text":"FAQ # Here you can find a list of frequently asked questions about HyperQueue. If you'd like to ask about anything related to HyperQueue, feel free to ask on our discussion forum or on our Zulip server . HQ fundamentals # How does HQ work? You start a HQ server somewhere (e.g. a login node or a cloud partition of a cluster). Then you can submit your jobs containing tasks to the server. You may have hundreds of thousands of tasks; they may have various CPUs and other resource requirements. Then you can connect any number of HQ workers to the server (either manually or via SLURM/PBS). The server will then immediately start to assign tasks to them. Workers are fully and dynamically controlled by server; you do not need to specify what tasks are executed on a particular worker or preconfigure it in any way. HQ provides a command line tool for submitting and controlling jobs. What is a task in HQ? Task is a unit of computation. Currently, it is either the execution of an arbitrary external program (specified via CLI) or the execution of a single Python function (specified via our Python API). What is a job in HQ? Job is a collection of tasks (a task graph). You can display and manage jobs using the CLI. What operating systems does HQ support? HyperQueue currently only officially supports Linux. It might be possible to compile it for other operating systems, however we do not provide any support nor promise to fix any bugs for other operating systems. How to deploy HQ? HQ is distributed as a single, self-contained and statically linked binary. It allows you to start the server, the workers, and it also serves as CLI for submitting and controlling jobs. No other services are needed. How many jobs/tasks may I submit into HQ? Our preliminary benchmarks show that the overhead of HQ is around 0.1 ms per task. It should be thus possible to submit a job with tens or hundreds of thousands tasks into HQ. Note that HQ is designed for a large number of tasks, not jobs. If you want to perform a lot of computations, use task arrays , i.e. create a job with many tasks, not many jobs each with a single task. HQ also supports streaming of task outputs into a single file. This avoids creating many small files for each task on a distributed file system, which improves scaling. Does HQ support multi-CPU tasks? Yes. You can define an arbitrary amount of CPUs for each task. HQ is also NUMA aware and you can select the NUMA allocation strategy. Does HQ support job/task arrays? Yes, see task arrays . Does HQ support tasks with dependencies? Yes, although it is currently only implemented in the Python API, which is experimental. It is currently not possible to specify dependencies using the CLI. How is HQ implemented? HQ is implemented in Rust and uses Tokio ecosystem. The scheduler is work-stealing scheduler implemented in our project Tako , that is derived from our previous work RSDS . 
Integration tests are written in Python, but HQ itself does not depend on Python. Relation to HPC technologies # Do I need to SLURM or PBS to run HQ? No. Even though HQ is designed to work smoothly on systems using SLURM/PBS, they are not required in order for HQ to work. Is HQ a replacement for SLURM or PBS? Definitely not. Multi-tenancy is out of the scope of HQ, i.e. HQ does not provide user isolation. HQ is light-weight and easy to deploy; on an HPC system each user (or a group of users that trust each other) may run their own instance of HQ. Do I need an HPC cluster to run HQ? No. None of functionality is bound to any HPC technology. Communication between all components is performed using TCP/IP. You can also run HQ locally on your personal computer. Is it safe to run HQ on a login node shared by other users? Yes. All communication is secured and encrypted. The server generates a secret file and only those users that have access to that file may submit jobs and connect workers. Users without access to the secret file will only see that the service is running. Performance should also not be a concern. Our experiments show that the server consumes only ~0.3ms of CPU time every second per a thousand tasks executed. Relation to other task runtimes # How does HQ differ from SnakeMake/Dask/Merlin/...? You can find a comparison of HQ with similar tools here .","title":"FAQ"},{"location":"faq/#faq","text":"Here you can find a list of frequently asked questions about HyperQueue. If you'd like to ask about anything related to HyperQueue, feel free to ask on our discussion forum or on our Zulip server .","title":"FAQ"},{"location":"faq/#hq-fundamentals","text":"How does HQ work? You start a HQ server somewhere (e.g. a login node or a cloud partition of a cluster). Then you can submit your jobs containing tasks to the server. You may have hundreds of thousands of tasks; they may have various CPUs and other resource requirements. Then you can connect any number of HQ workers to the server (either manually or via SLURM/PBS). The server will then immediately start to assign tasks to them. Workers are fully and dynamically controlled by server; you do not need to specify what tasks are executed on a particular worker or preconfigure it in any way. HQ provides a command line tool for submitting and controlling jobs. What is a task in HQ? Task is a unit of computation. Currently, it is either the execution of an arbitrary external program (specified via CLI) or the execution of a single Python function (specified via our Python API). What is a job in HQ? Job is a collection of tasks (a task graph). You can display and manage jobs using the CLI. What operating systems does HQ support? HyperQueue currently only officially supports Linux. It might be possible to compile it for other operating systems, however we do not provide any support nor promise to fix any bugs for other operating systems. How to deploy HQ? HQ is distributed as a single, self-contained and statically linked binary. It allows you to start the server, the workers, and it also serves as CLI for submitting and controlling jobs. No other services are needed. How many jobs/tasks may I submit into HQ? Our preliminary benchmarks show that the overhead of HQ is around 0.1 ms per task. It should be thus possible to submit a job with tens or hundreds of thousands tasks into HQ. Note that HQ is designed for a large number of tasks, not jobs. If you want to perform a lot of computations, use task arrays , i.e. 
create a job with many tasks, not many jobs each with a single task. HQ also supports streaming of task outputs into a single file. This avoids creating many small files for each task on a distributed file system, which improves scaling. Does HQ support multi-CPU tasks? Yes. You can define an arbitrary amount of CPUs for each task. HQ is also NUMA aware and you can select the NUMA allocation strategy. Does HQ support job/task arrays? Yes, see task arrays . Does HQ support tasks with dependencies? Yes, although it is currently only implemented in the Python API, which is experimental. It is currently not possible to specify dependencies using the CLI. How is HQ implemented? HQ is implemented in Rust and uses Tokio ecosystem. The scheduler is work-stealing scheduler implemented in our project Tako , that is derived from our previous work RSDS . Integration tests are written in Python, but HQ itself does not depend on Python.","title":"HQ fundamentals"},{"location":"faq/#relation-to-hpc-technologies","text":"Do I need to SLURM or PBS to run HQ? No. Even though HQ is designed to work smoothly on systems using SLURM/PBS, they are not required in order for HQ to work. Is HQ a replacement for SLURM or PBS? Definitely not. Multi-tenancy is out of the scope of HQ, i.e. HQ does not provide user isolation. HQ is light-weight and easy to deploy; on an HPC system each user (or a group of users that trust each other) may run their own instance of HQ. Do I need an HPC cluster to run HQ? No. None of functionality is bound to any HPC technology. Communication between all components is performed using TCP/IP. You can also run HQ locally on your personal computer. Is it safe to run HQ on a login node shared by other users? Yes. All communication is secured and encrypted. The server generates a secret file and only those users that have access to that file may submit jobs and connect workers. Users without access to the secret file will only see that the service is running. Performance should also not be a concern. Our experiments show that the server consumes only ~0.3ms of CPU time every second per a thousand tasks executed.","title":"Relation to HPC technologies"},{"location":"faq/#relation-to-other-task-runtimes","text":"How does HQ differ from SnakeMake/Dask/Merlin/...? You can find a comparison of HQ with similar tools here .","title":"Relation to other task runtimes"},{"location":"installation/","text":"Binary distribution (recommended) # The easiest way to install HyperQueue is to download and unpack the prebuilt hq executable: Download the latest release archive from this link . Target architecture Make sure to choose the correct binary for your architecture. Currently, we provide prebuilt binaries for x86-64 and PowerPC architectures. Unpack the downloaded archive: $ tar -xvzf hq--linux-.tar.gz The archive contains a single binary hq , which is used both for deploying the HQ cluster and submitting tasks into HQ . You can add hq to your system $PATH to make its usage easier. See Quickstart for an example \"Hello world\" HyperQueue computation. Compilation from source code # You can also compile HyperQueue from source. This allows you to build HyperQueue for architectures for which we do not provide prebuilt binaries. It can also generate binaries with support for vectorization, which could in theory improve the performance of HyperQueue in extreme cases. 
Setup a Rust toolchain Clone the HyperQueue repository: $ git clone https://github.com/It4innovations/hyperqueue/ Build HyperQueue: $ RUSTFLAGS = \"-C target-cpu=native\" cargo build --release Jemalloc dependency HyperQueue by default depends on the Jemalloc memory allocator, which is a C library. If you're having problems with installing HyperQueue because of this dependency, you can opt-out of it and use the default system allocator by building HQ with --no-default-features : $ cargo build --release --no-default-features Use the executable located in ./target/release/hq","title":"Installation"},{"location":"installation/#binary-distribution-recommended","text":"The easiest way to install HyperQueue is to download and unpack the prebuilt hq executable: Download the latest release archive from this link . Target architecture Make sure to choose the correct binary for your architecture. Currently, we provide prebuilt binaries for x86-64 and PowerPC architectures. Unpack the downloaded archive: $ tar -xvzf hq--linux-.tar.gz The archive contains a single binary hq , which is used both for deploying the HQ cluster and submitting tasks into HQ . You can add hq to your system $PATH to make its usage easier. See Quickstart for an example \"Hello world\" HyperQueue computation.","title":"Binary distribution (recommended)"},{"location":"installation/#compilation-from-source-code","text":"You can also compile HyperQueue from source. This allows you to build HyperQueue for architectures for which we do not provide prebuilt binaries. It can also generate binaries with support for vectorization, which could in theory improve the performance of HyperQueue in extreme cases. Setup a Rust toolchain Clone the HyperQueue repository: $ git clone https://github.com/It4innovations/hyperqueue/ Build HyperQueue: $ RUSTFLAGS = \"-C target-cpu=native\" cargo build --release Jemalloc dependency HyperQueue by default depends on the Jemalloc memory allocator, which is a C library. If you're having problems with installing HyperQueue because of this dependency, you can opt-out of it and use the default system allocator by building HQ with --no-default-features : $ cargo build --release --no-default-features Use the executable located in ./target/release/hq","title":"Compilation from source code"},{"location":"other-tools/","text":"Comparison with other task runtimes # There is a very large number of different task runtimes, with various performance characteristics, feature sets, programming models and trade-offs, and it is of course infeasible to compare HyperQueue with all of them. One of HyperQueue's authors has written a PhD thesis titled Ergonomics and efficiency of workflows on HPC clusters , which includes a section that compares HyperQueue with several other tools. We invite you to examine this section (and the whole thesis) if you want to find out more about the relation of HyperQueue to other task runtimes. The descriptions of other task runtimes presented on this page are actual as of October 2024. Below you can find a table 1 , which compares selected features of twelve task runtimes that we have experience with and/or that we think are relevant for HyperQueue. You can find more information about the table in Section 7.6 of the thesis. Below we also provide opinionated 2 descriptions of selected task runtimes that we think can be reasonable compared with HyperQueue. 
GNU Parallel HyperShell Dask Ray Parsl PyCOMPSs Pegasus Balsam AutoSubmit FireWorks SnakeMake Merlin GNU Parallel # GNU Parallel is a command-line utility for executing many tasks in parallel on a set of computational nodes. It does not offer many advanced task runtime features, but it does one thing well; it enables a parallelized and even distributed execution of a set of programs with a single command invocation. HyperQueue takes inspiration from this approach, as it offers a CLI that can be used to execute task graphs with many tasks and complex resource requirements with a single command. HyperShell # HyperShell is primarily designed for executing many homogeneous tasks using the command-line. It does introduce several useful features on top of GNU Parallel , such as automatic task re-execution when a task fails and storing the task state in a database, which enables users to observe the history of executed workflows. It also provides a simple autoscaling functionality that automatically submits allocations. However, tasks in HyperShell are strictly tied to allocations; by default, one task is submitted in a single allocation. It does provide the option to bundle several tasks together, but users have to specify the maximum bundle size explicitly, which makes load balancing inflexible. HyperShell does not support task dependencies; therefore, it cannot be used to execute general task graphs. Dask # Dask is a task runtime that is very popular within the Python community, which allows executing arbitrary task graphs composed of Python functions on a distributed cluster. It also supports distributing code using numpy or pandas compatible API. While Dask by itself does not interact with PBS or Slurm, you can use Dask-JobQueue to make it operate in a similar fashion as HyperQueue - with the centralized server running on a login node and the workers running on compute nodes. Dask does not support arbitrary resource requirements and since it is written in Python, it can have problems with scaling to very large task graphs. If your use-case is primarily Python-based though, you should definitely give Dask a try, it's a great tool. Ray # Ray is a distributed task runtime primarily aimed at parallelizing the training and inference of machine learning models in Python. It uses a relatively unique architecture that leverages distributed scheduling; not all task submission and scheduling decisions need to go through a central location, unlike most other compared task runtimes including HyperQueue. This allows it to scale to an enormous amount of resources, millions of tasks and thousands of nodes. However, in order to enable this level of scalability, the workflow itself has to be implemented in a way where tasks submit new tasks from worker nodes dynamically. Therefore, batch computing use-cases that simply want to execute a predetermined workflow might be unable to achieve such high performance. Same as Dask , it offers basic resource requirements and it also supports fractional resources and related resource groups. However, it does not allow expressing multi-node tasks. In contrast to Dask, it is internally implemented in C++ , which introduces much less overhead than Python. Even though Ray provides some autoscaling functionality, it does not support Slurm or other HPC allocation managers. 
In general, it is not specialized for HPC idiosyncrasies nor for executing arbitrary task graphs; even though it has a low-level interface for creating tasks through Python functions, it primarily focuses on generating task graphs automatically from high-level descriptions of machine learning pipelines, which are then executed e.g. on cloud resources. Parsl # Parsl is another representative of a Python-oriented task runtime. It allows defining tasks that represent either Python function calls or command-line application invocations using Python. Computational resources in Parsl are configured through a block , a set of preconfigured resources (nodes) designed for executing specific kinds of tasks. In addition to blocks, users also have to specify launchers , which determine how will be each task executed (e.g. using a Slurm or an MPI execution command) and also an executor , which controls how will be tasks scheduled and batched into allocations and if the execution will be fault-tolerant. While these options let users specify how will be their task graph executed on a very granular level, it requires them to tune this configuration per task graph or target cluster; the configuration system is also relatively complex. This is in contrast to HyperQueue, which has a fully general resource management model that does not require users to configure anything; tasks are automatically load balanced across all available workers regardless of allocations and workers do not have to be preconfigured for specific tasks. Parsl has basic support for resource requirements, but does not allow creating custom user-specified resource kinds. It also allows specifying the number of nodes assigned to a task; however, such tasks have to be executed within a single block; Parsl does not allow executing multi-node tasks across different blocks or allocations. PyCOMPSs # PyCOMPSs is a Python interface for executing task graphs on top of the COMPSs distributed system. It allows defining arbitrary task graphs and has comprehensive support for multi-node tasks and basic resource requirements, but it does not allow users to define custom resource requirements. It was extended to support configuration of NUMA nodes for individual tasks. In terms of scheduling, it implements several simple scheduling algorithms; users can select which one should be used. Assignment of tasks to allocations is performed in a manual way; users enqueue a task graph (an application), which is then fully executed once that allocation is started. COMPSs provides basic support for automatic allocation that can dynamically react to computational load. However, it can only add or remove nodes from a primary allocation that is always tied to the execution of a single application; it does not provide fully flexible load balancing. PyCOMPSs is slightly more challenging to deploy than most of the other compared task runtimes, since it also requires a Java runtime environment in addition to a Python interpreter. Pegasus # Pegasus is a very general workflow management system that can execute %workflows on a wide range of clusters, from HPC to cloud. It provides support for various additional features that have not been examined in this thesis, such as data provenance or advanced file management and staging. Its workflows are usually defined using workflow files, which enable specifying dependencies both explicitly or by inferring them from input/output files of tasks. 
It also supports basic resource requirements, but does not allow defining custom resource kinds nor using multi-node tasks. By default, it maps each task to a single allocation, but it also allows users to cluster tasks together using one of several predefined modes. However, users have to configure this clustering manually; it is not performed fully automatically like in HyperQueue. In terms of deployment, it has the most complex set of runtime dependencies out of the compared task runtimes, as it requires not only a Python interpreter and a Java runtime environment, but also the HTCondor workload management system, which can be non-trivial to install on an HPC cluster. Pegasus delegates some of its functionality to HTCondor; it requires a configured instance of HTCondor before it can execute workflows on a cluster. Balsam # Balsam is a task runtime for executing workflows defined using Python on HPC clusters. It uses a similar fully flexible method for mapping tasks to allocations as HyperQueue, including automatic allocation; however, it is limited to a single allocation queue, similarly as in Dask . It supports multi-node tasks, although users have to statically preconfigure workers to either execute single-node or multi-node tasks. It does not allow specifying custom resource kinds nor more advanced resource management offered by HyperQueue, such as resource variants . The Balsam server requires access to a PostgreSQL database instance, which makes its deployment slightly more challenging than some other tools that do not need a database or that can use an embedded database like SQLite. AutoSubmit # AutoSubmit is a high-level tool for executing workflows and experiments. It focuses primarily on experiment tracking, data provenance and workflow automation. In its default mode, each task corresponds to a single allocation, which is not ideal for short running tasks; AutoSubmit is designed primarily for coarse-grained workflows. It provides a way to bundle multiple tasks into the same allocation using wrappers , but same as with e.g. Pegasus , this has to be preconfigured statically by the user; it is not performed automatically. AutoSubmit does not support custom task resource kinds and it also does not support direct data transfers between tasks nor output streaming. FireWorks # FireWorks is a workflow system for managing the execution of workflows on distributed clusters. It allows defining task graphs using either workflow files or through a Python API. It supports fault-tolerant task execution, although failed tasks have to be re-executed manually. FireWorks does not seem to support any task resource requirements; resources can only be configured for individual allocations. Its meta-scheduling approach is relatively complicated; it provides several ways of mapping tasks to allocations and individual workers with different trade-offs rather than providing a unified way that users would not have to worry about. FireWorks requires a MongoDB database to store tasks, which can make its deployment slightly challenging. SnakeMake # SnakeMake is a popular workflow management system for executing coarse-grained workflows defined using workflow files that can be extended with inline Python code. It can operate both as a meta-scheduler (outside of PBS/Slurm) and also as a classical task runtime within a PBS/Slurm job. Its workflows are based on files; tasks are expected to produce and consume files, which are also used to infer dependencies between them. 
This can pose an issue with a large number of tasks, as the created files can overload distributed filesystems; no output streaming is offered by the task runtime. It enables assigning both known (e.g. CPU or memory) and custom resource kinds to tasks. It also allows specifying the number of nodes required for each task. With SnakeMake, you can submit a workflow either using a task-per-job model (which has high overhead ) or you can partition the workflow into several jobs, but in that case SnakeMake will not provide load balancing across these partitions, and partitioning the jobs manually can be quite arduous. HyperQueue allows you to submit large workflows without partitioning them manually in any way, as the server will dynamically load balance the tasks onto workers from different PBS/Slurm allocations. Since SnakeMake workflows are defined in configuration files, it's a bit more involved to run computations in SnakeMake than in HyperQueue. On the other hand, SnakeMake lets you define more complex workflows with improved traceability and reproducibility. Merlin # Merlin is a task queueing system that enables execution of large workflows on HPC clusters. It leverages the Celery task queue for distributing tasks to workers and the Maestro workflow specification for defining task graphs. Tasks are submitted into separate Celery queues, whose resources need to be preconfigured; its load balancing is thus not fully flexible and automatic like in HyperQueue. It also does not support automatic allocation and nor does it support custom resource kinds. Failed tasks can be automatically restarted if they end with a specific status code; however, if they fail because of unexpected reasons, users have to mark them for re-execution manually. Merlin requires a message broker backend, such as RabbitMQ or Redis, for its functionality, which makes its deployment non-trivial. It corresponds to Table 7.2 from the PhD thesis. \u21a9 If you think that our description is inaccurate or misleading, please file an issue . \u21a9","title":"Comparison With Other Tools"},{"location":"other-tools/#comparison-with-other-task-runtimes","text":"There is a very large number of different task runtimes, with various performance characteristics, feature sets, programming models and trade-offs, and it is of course infeasible to compare HyperQueue with all of them. One of HyperQueue's authors has written a PhD thesis titled Ergonomics and efficiency of workflows on HPC clusters , which includes a section that compares HyperQueue with several other tools. We invite you to examine this section (and the whole thesis) if you want to find out more about the relation of HyperQueue to other task runtimes. The descriptions of other task runtimes presented on this page are actual as of October 2024. Below you can find a table 1 , which compares selected features of twelve task runtimes that we have experience with and/or that we think are relevant for HyperQueue. You can find more information about the table in Section 7.6 of the thesis. Below we also provide opinionated 2 descriptions of selected task runtimes that we think can be reasonable compared with HyperQueue. GNU Parallel HyperShell Dask Ray Parsl PyCOMPSs Pegasus Balsam AutoSubmit FireWorks SnakeMake Merlin","title":"Comparison with other task runtimes"},{"location":"other-tools/#gnu-parallel","text":"GNU Parallel is a command-line utility for executing many tasks in parallel on a set of computational nodes. 
It does not offer many advanced task runtime features, but it does one thing well; it enables a parallelized and even distributed execution of a set of programs with a single command invocation. HyperQueue takes inspiration from this approach, as it offers a CLI that can be used to execute task graphs with many tasks and complex resource requirements with a single command.","title":"GNU Parallel"},{"location":"other-tools/#hypershell","text":"HyperShell is primarily designed for executing many homogeneous tasks using the command-line. It does introduce several useful features on top of GNU Parallel , such as automatic task re-execution when a task fails and storing the task state in a database, which enables users to observe the history of executed workflows. It also provides a simple autoscaling functionality that automatically submits allocations. However, tasks in HyperShell are strictly tied to allocations; by default, one task is submitted in a single allocation. It does provide the option to bundle several tasks together, but users have to specify the maximum bundle size explicitly, which makes load balancing inflexible. HyperShell does not support task dependencies; therefore, it cannot be used to execute general task graphs.","title":"HyperShell"},{"location":"other-tools/#dask","text":"Dask is a task runtime that is very popular within the Python community, which allows executing arbitrary task graphs composed of Python functions on a distributed cluster. It also supports distributing code using numpy or pandas compatible API. While Dask by itself does not interact with PBS or Slurm, you can use Dask-JobQueue to make it operate in a similar fashion as HyperQueue - with the centralized server running on a login node and the workers running on compute nodes. Dask does not support arbitrary resource requirements and since it is written in Python, it can have problems with scaling to very large task graphs. If your use-case is primarily Python-based though, you should definitely give Dask a try, it's a great tool.","title":"Dask"},{"location":"other-tools/#ray","text":"Ray is a distributed task runtime primarily aimed at parallelizing the training and inference of machine learning models in Python. It uses a relatively unique architecture that leverages distributed scheduling; not all task submission and scheduling decisions need to go through a central location, unlike most other compared task runtimes including HyperQueue. This allows it to scale to an enormous amount of resources, millions of tasks and thousands of nodes. However, in order to enable this level of scalability, the workflow itself has to be implemented in a way where tasks submit new tasks from worker nodes dynamically. Therefore, batch computing use-cases that simply want to execute a predetermined workflow might be unable to achieve such high performance. Same as Dask , it offers basic resource requirements and it also supports fractional resources and related resource groups. However, it does not allow expressing multi-node tasks. In contrast to Dask, it is internally implemented in C++ , which introduces much less overhead than Python. Even though Ray provides some autoscaling functionality, it does not support Slurm or other HPC allocation managers. 
In general, it is not specialized for HPC idiosyncrasies nor for executing arbitrary task graphs; even though it has a low-level interface for creating tasks through Python functions, it primarily focuses on generating task graphs automatically from high-level descriptions of machine learning pipelines, which are then executed e.g. on cloud resources.","title":"Ray"},{"location":"other-tools/#parsl","text":"Parsl is another representative of a Python-oriented task runtime. It allows defining tasks that represent either Python function calls or command-line application invocations using Python. Computational resources in Parsl are configured through a block , a set of preconfigured resources (nodes) designed for executing specific kinds of tasks. In addition to blocks, users also have to specify launchers , which determine how will be each task executed (e.g. using a Slurm or an MPI execution command) and also an executor , which controls how will be tasks scheduled and batched into allocations and if the execution will be fault-tolerant. While these options let users specify how will be their task graph executed on a very granular level, it requires them to tune this configuration per task graph or target cluster; the configuration system is also relatively complex. This is in contrast to HyperQueue, which has a fully general resource management model that does not require users to configure anything; tasks are automatically load balanced across all available workers regardless of allocations and workers do not have to be preconfigured for specific tasks. Parsl has basic support for resource requirements, but does not allow creating custom user-specified resource kinds. It also allows specifying the number of nodes assigned to a task; however, such tasks have to be executed within a single block; Parsl does not allow executing multi-node tasks across different blocks or allocations.","title":"Parsl"},{"location":"other-tools/#pycompss","text":"PyCOMPSs is a Python interface for executing task graphs on top of the COMPSs distributed system. It allows defining arbitrary task graphs and has comprehensive support for multi-node tasks and basic resource requirements, but it does not allow users to define custom resource requirements. It was extended to support configuration of NUMA nodes for individual tasks. In terms of scheduling, it implements several simple scheduling algorithms; users can select which one should be used. Assignment of tasks to allocations is performed in a manual way; users enqueue a task graph (an application), which is then fully executed once that allocation is started. COMPSs provides basic support for automatic allocation that can dynamically react to computational load. However, it can only add or remove nodes from a primary allocation that is always tied to the execution of a single application; it does not provide fully flexible load balancing. PyCOMPSs is slightly more challenging to deploy than most of the other compared task runtimes, since it also requires a Java runtime environment in addition to a Python interpreter.","title":"PyCOMPSs"},{"location":"other-tools/#pegasus","text":"Pegasus is a very general workflow management system that can execute %workflows on a wide range of clusters, from HPC to cloud. It provides support for various additional features that have not been examined in this thesis, such as data provenance or advanced file management and staging. 
Its workflows are usually defined using workflow files, which enable specifying dependencies both explicitly or by inferring them from input/output files of tasks. It also supports basic resource requirements, but does not allow defining custom resource kinds nor using multi-node tasks. By default, it maps each task to a single allocation, but it also allows users to cluster tasks together using one of several predefined modes. However, users have to configure this clustering manually; it is not performed fully automatically like in HyperQueue. In terms of deployment, it has the most complex set of runtime dependencies out of the compared task runtimes, as it requires not only a Python interpreter and a Java runtime environment, but also the HTCondor workload management system, which can be non-trivial to install on an HPC cluster. Pegasus delegates some of its functionality to HTCondor; it requires a configured instance of HTCondor before it can execute workflows on a cluster.","title":"Pegasus"},{"location":"other-tools/#balsam","text":"Balsam is a task runtime for executing workflows defined using Python on HPC clusters. It uses a similar fully flexible method for mapping tasks to allocations as HyperQueue, including automatic allocation; however, it is limited to a single allocation queue, similarly as in Dask . It supports multi-node tasks, although users have to statically preconfigure workers to either execute single-node or multi-node tasks. It does not allow specifying custom resource kinds nor more advanced resource management offered by HyperQueue, such as resource variants . The Balsam server requires access to a PostgreSQL database instance, which makes its deployment slightly more challenging than some other tools that do not need a database or that can use an embedded database like SQLite.","title":"Balsam"},{"location":"other-tools/#autosubmit","text":"AutoSubmit is a high-level tool for executing workflows and experiments. It focuses primarily on experiment tracking, data provenance and workflow automation. In its default mode, each task corresponds to a single allocation, which is not ideal for short running tasks; AutoSubmit is designed primarily for coarse-grained workflows. It provides a way to bundle multiple tasks into the same allocation using wrappers , but same as with e.g. Pegasus , this has to be preconfigured statically by the user; it is not performed automatically. AutoSubmit does not support custom task resource kinds and it also does not support direct data transfers between tasks nor output streaming.","title":"AutoSubmit"},{"location":"other-tools/#fireworks","text":"FireWorks is a workflow system for managing the execution of workflows on distributed clusters. It allows defining task graphs using either workflow files or through a Python API. It supports fault-tolerant task execution, although failed tasks have to be re-executed manually. FireWorks does not seem to support any task resource requirements; resources can only be configured for individual allocations. Its meta-scheduling approach is relatively complicated; it provides several ways of mapping tasks to allocations and individual workers with different trade-offs rather than providing a unified way that users would not have to worry about. 
FireWorks requires a MongoDB database to store tasks, which can make its deployment slightly challenging.","title":"FireWorks"},{"location":"other-tools/#snakemake","text":"SnakeMake is a popular workflow management system for executing coarse-grained workflows defined using workflow files that can be extended with inline Python code. It can operate both as a meta-scheduler (outside of PBS/Slurm) and also as a classical task runtime within a PBS/Slurm job. Its workflows are based on files; tasks are expected to produce and consume files, which are also used to infer dependencies between them. This can pose an issue with a large number of tasks, as the created files can overload distributed filesystems; no output streaming is offered by the task runtime. It enables assigning both known (e.g. CPU or memory) and custom resource kinds to tasks. It also allows specifying the number of nodes required for each task. With SnakeMake, you can submit a workflow either using a task-per-job model (which has high overhead ) or you can partition the workflow into several jobs, but in that case SnakeMake will not provide load balancing across these partitions, and partitioning the jobs manually can be quite arduous. HyperQueue allows you to submit large workflows without partitioning them manually in any way, as the server will dynamically load balance the tasks onto workers from different PBS/Slurm allocations. Since SnakeMake workflows are defined in configuration files, it's a bit more involved to run computations in SnakeMake than in HyperQueue. On the other hand, SnakeMake lets you define more complex workflows with improved traceability and reproducibility.","title":"SnakeMake"},{"location":"other-tools/#merlin","text":"Merlin is a task queueing system that enables execution of large workflows on HPC clusters. It leverages the Celery task queue for distributing tasks to workers and the Maestro workflow specification for defining task graphs. Tasks are submitted into separate Celery queues, whose resources need to be preconfigured; its load balancing is thus not fully flexible and automatic like in HyperQueue. It also does not support automatic allocation and nor does it support custom resource kinds. Failed tasks can be automatically restarted if they end with a specific status code; however, if they fail because of unexpected reasons, users have to mark them for re-execution manually. Merlin requires a message broker backend, such as RabbitMQ or Redis, for its functionality, which makes its deployment non-trivial. It corresponds to Table 7.2 from the PhD thesis. \u21a9 If you think that our description is inaccurate or misleading, please file an issue . \u21a9","title":"Merlin"},{"location":"quickstart/","text":"Here we provide an example of deploying HyperQueue on a local computer and running a simple \"Hello world\" script. Run each of the following three commands in separate terminals. Start the HyperQueue server $ hq server start The server will manage computing resources (workers) and distribute submitted tasks amongst them. Start a HyperQueue worker $ hq worker start The worker will connect to the server and execute submitted tasks. Submit a simple computation $ hq submit echo \"Hello world\" This command will submit a job with a single task that will execute echo \"Hello world\" on a worker. You can find the output of the task in job-1/0.stdout . That's it! 
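For convenience, the whole quickstart can also be driven from a single terminal by keeping the server and the worker in the background. The following is only a sketch that reuses the commands shown above; the job-1/0.stdout path assumes this is the first job submitted to a fresh server, and last is the job selector described in the CLI shortcuts section:
$ hq server start &               # run the server in the background
$ hq worker start &               # start one worker and connect it to the server
$ hq submit echo \"Hello world\"    # submit a job with a single task
$ hq job wait last                # wait for the most recently submitted job to finish
$ cat job-1/0.stdout              # inspect the task output
Hello world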
For a more in-depth explanation of how HyperQueue works and what it can do, check the Deployment and Jobs sections.","title":"Quickstart"},{"location":"cli/dashboard/","text":"HyperQueue offers a command-line dashboard that shows information about the state of workers and jobs. It can show which jobs are currently queued or running, which tasks are running on which workers, or the current hardware utilization of workers. Warning The dashboard is currently in an experimental stage. Some of its features might not work properly, and important features might be missing. Please let us know if you encounter any issues with it, or if you want us to add new features to it. Dashboard disabled Note that the dashboard has been temporarily disabled in HyperQueue 0.19.0 because of internal architectural changes. We plan to re-enable it in the future. You can start the dashboard using the hq dashboard command: $ hq dashboard The dashboard will try to connect to a running HyperQueue server, and display various information. You can navigate the dashboard using your keyboard. Here is an example video that shows what the dashboard looks like:","title":"Dashboard"},{"location":"cli/output-mode/","text":"By default, HyperQueue CLI commands output information in a human-readable way, usually in the form of a table. If you want to use the CLI commands programmatically, HyperQueue offers two additional output modes that are designed to be machine-readable. You can change the output type of any HyperQueue CLI command either by using the --output-mode flag or by setting the HQ_OUTPUT_MODE environment variable. Flag Environment variable $ hq --output-mode = json job list $ HQ_OUTPUT_MODE = json hq job list Currently, there are three output modes available: the default, human-readable cli mode, and two machine-readable modes, JSON and Quiet . Important Each machine-readable mode supports a set of commands. You can also use commands that are not listed here, but their output might be unstable, or they might not output anything for a given output mode. JSON # The json output mode is intended to provide very detailed information in the form of a JSON value. With this mode, HyperQueue will always output exactly one JSON value, either an array or an object. Error handling # When an error occurs during the execution of a command, the program will exit with exit code 1 and output a JSON object with a single error key containing a human-readable description of the error. Date formatting # Time-based items are formatted in the following way: Duration - formatted as a floating point number of seconds.
Datetime (timestamp) - formatted as a ISO8601 date in UTC Supported commands # Server info: hq server info Example { \"host\" : \"my-machine\" , \"hq_port\" : 42189 , \"pid\" : 32586 , \"server_dir\" : \"/foo/bar/.hq-server\" , \"start_date\" : \"2021-12-20T08:45:41.775753188Z\" , \"version\" : \"0.7.0\" , \"worker_port\" : 38627 } Worker list: hq worker list Example [{ \"configuration\" : { \"heartbeat_interval\" : 8.0 , \"hostname\" : \"my-machine\" , \"idle_timeout\" : null , \"listen_address\" : \"my-machine:45611\" , \"log_dir\" : \"...\" , \"resources\" : { \"cpus\" : [[ 0 , 1 , 2 , 3 ]], \"generic\" : [{ \"kind\" : \"sum\" , \"name\" : \"resource1\" , \"params\" : { \"size\" : 1000 } }] }, \"time_limit\" : null , \"work_dir\" : \"...\" }, \"ended\" : null , \"id\" : 1 }] Worker info: hq worker info Example { \"configuration\" : { \"heartbeat_interval\" : 8.0 , \"hostname\" : \"my-machine\" , \"idle_timeout\" : null , \"listen_address\" : \"my-machine:45611\" , \"log_dir\" : \"...\" , \"resources\" : { \"cpus\" : [[ 0 , 1 , 2 , 3 ]], \"generic\" : [{ \"kind\" : \"sum\" , \"name\" : \"resource1\" , \"params\" : { \"size\" : 1000 } }] }, \"time_limit\" : null , \"work_dir\" : \"...\" }, \"ended\" : null , \"id\" : 1 } Submit a job: hq submit Example { \"id\" : 1 } Job list: hq job list Example [{ \"id\" : 1 , \"name\" : \"ls\" , \"resources\" : { \"cpus\" : { \"cpus\" : 1 , \"type\" : \"compact\" }, \"generic\" : [], \"min_time\" : 0.0 }, \"task_count\" : 1 , \"task_stats\" : { \"canceled\" : 0 , \"failed\" : 0 , \"finished\" : 1 , \"running\" : 0 , \"waiting\" : 0 } }] Job info: hq job info --tasks Example { \"finished_at\" : \"2021-12-20T08:56:16.438062340Z\" , \"info\" : { \"id\" : 1 , \"name\" : \"ls\" , \"resources\" : { \"cpus\" : { \"cpus\" : 1 , \"type\" : \"compact\" }, \"generic\" : [], \"min_time\" : 0.0 }, \"task_count\" : 1 , \"task_stats\" : { \"canceled\" : 0 , \"failed\" : 0 , \"finished\" : 1 , \"running\" : 0 , \"waiting\" : 0 } }, \"max_fails\" : null , \"pin\" : null , \"priority\" : 0 , \"program\" : { \"args\" : [ \"ls\" ], \"cwd\" : \"%{SUBMIT_DIR}\" , \"env\" : { \"FOO\" : \"BAR\" }, \"stderr\" : { \"File\" : \"job-%{JOB_ID}/%{TASK_ID}.stderr\" }, \"stdout\" : { \"File\" : \"job-%{JOB_ID}/%{TASK_ID}.stdout\" } }, \"started_at\" : \"2021-12-20T08:45:53.458919345Z\" , \"tasks\" : [{ \"finished_at\" : \"2021-12-20T08:56:16.438062340Z\" , \"id\" : 0 , \"started_at\" : \"2021-12-20T08:56:16.437123396Z\" , \"state\" : \"finished\" , \"worker\" : 1 , \"cwd\" : \"/tmp/foo\" , \"stderr\" : { \"File\" : \"job-1/0.stderr\" }, \"stdout\" : { \"File\" : \"job-1/0.stdout\" } }], \"time_limit\" : null , \"submit_dir\" : \"/foo/bar/submit\" } Automatic allocation queue list: hq alloc list Example [{ \"additional_args\" : [], \"backlog\" : 4 , \"id\" : 1 , \"manager\" : \"PBS\" , \"max_worker_count\" : null , \"name\" : null , \"timelimit\" : 1800.0 , \"worker_cpu_args\" : null , \"worker_resource_args\" : [], \"workers_per_alloc\" : 1 }] Automatic allocation queue info: hq alloc info Example [{ \"id\" : \"pbs-1\" , \"worker_count\" : 4 , \"queue_at\" : \"2021-12-20T08:56:16.437123396Z\" , \"started_at\" : \"2021-12-20T08:58:25.538001256Z\" , \"ended_at\" : null , \"status\" : \"running\" , \"workdir\" : \"/foo/bar\" }] Automatic allocation queue events: hq alloc events Example [{ \"date\" : \"2021-12-20T08:56:16.437123396Z\" , \"event\" : \"allocation-finished\" , \"params\" : { \"id\" : \"pbs-1\" } }, { \"date\" : \"2021-12-20T08:58:16.437123396Z\" , \"event\" : 
\"status-fail\" , \"params\" : { \"error\" : \"qstat failed\" } }] Quiet # The quiet output mode will cause HyperQueue to output only the most important information that should be parseable without any complex parsing logic, e.g. using only Bash scripts. Error handling # When an error occurs during the execution of a command, the program will exit with exit code 1 and the error will be printed to the standard error output. Supported commands # Submit a job: hq submit Schema Outputs a single line containing the ID of the created job. Example $ hq --output-mode = quiet submit ls 1","title":"Output mode"},{"location":"cli/output-mode/#json","text":"The json output mode is intended to provide very detailed information in the form of a JSON value. With this mode, HyperQueue will always output exactly one JSON value, either an array or an object.","title":"JSON"},{"location":"cli/output-mode/#error-handling","text":"When an error occurs during the execution of a command, the program will exit with exit code 1 and the program will output a JSON object with a single error key containing a human-readable description of the error.","title":"Error handling"},{"location":"cli/output-mode/#date-formatting","text":"Time-based items are formatted in the following way: Duration - formatted as a floating point number of seconds. Datetime (timestamp) - formatted as a ISO8601 date in UTC","title":"Date formatting"},{"location":"cli/output-mode/#supported-commands","text":"Server info: hq server info Example { \"host\" : \"my-machine\" , \"hq_port\" : 42189 , \"pid\" : 32586 , \"server_dir\" : \"/foo/bar/.hq-server\" , \"start_date\" : \"2021-12-20T08:45:41.775753188Z\" , \"version\" : \"0.7.0\" , \"worker_port\" : 38627 } Worker list: hq worker list Example [{ \"configuration\" : { \"heartbeat_interval\" : 8.0 , \"hostname\" : \"my-machine\" , \"idle_timeout\" : null , \"listen_address\" : \"my-machine:45611\" , \"log_dir\" : \"...\" , \"resources\" : { \"cpus\" : [[ 0 , 1 , 2 , 3 ]], \"generic\" : [{ \"kind\" : \"sum\" , \"name\" : \"resource1\" , \"params\" : { \"size\" : 1000 } }] }, \"time_limit\" : null , \"work_dir\" : \"...\" }, \"ended\" : null , \"id\" : 1 }] Worker info: hq worker info Example { \"configuration\" : { \"heartbeat_interval\" : 8.0 , \"hostname\" : \"my-machine\" , \"idle_timeout\" : null , \"listen_address\" : \"my-machine:45611\" , \"log_dir\" : \"...\" , \"resources\" : { \"cpus\" : [[ 0 , 1 , 2 , 3 ]], \"generic\" : [{ \"kind\" : \"sum\" , \"name\" : \"resource1\" , \"params\" : { \"size\" : 1000 } }] }, \"time_limit\" : null , \"work_dir\" : \"...\" }, \"ended\" : null , \"id\" : 1 } Submit a job: hq submit Example { \"id\" : 1 } Job list: hq job list Example [{ \"id\" : 1 , \"name\" : \"ls\" , \"resources\" : { \"cpus\" : { \"cpus\" : 1 , \"type\" : \"compact\" }, \"generic\" : [], \"min_time\" : 0.0 }, \"task_count\" : 1 , \"task_stats\" : { \"canceled\" : 0 , \"failed\" : 0 , \"finished\" : 1 , \"running\" : 0 , \"waiting\" : 0 } }] Job info: hq job info --tasks Example { \"finished_at\" : \"2021-12-20T08:56:16.438062340Z\" , \"info\" : { \"id\" : 1 , \"name\" : \"ls\" , \"resources\" : { \"cpus\" : { \"cpus\" : 1 , \"type\" : \"compact\" }, \"generic\" : [], \"min_time\" : 0.0 }, \"task_count\" : 1 , \"task_stats\" : { \"canceled\" : 0 , \"failed\" : 0 , \"finished\" : 1 , \"running\" : 0 , \"waiting\" : 0 } }, \"max_fails\" : null , \"pin\" : null , \"priority\" : 0 , \"program\" : { \"args\" : [ \"ls\" ], \"cwd\" : \"%{SUBMIT_DIR}\" , \"env\" : { \"FOO\" : \"BAR\" }, 
\"stderr\" : { \"File\" : \"job-%{JOB_ID}/%{TASK_ID}.stderr\" }, \"stdout\" : { \"File\" : \"job-%{JOB_ID}/%{TASK_ID}.stdout\" } }, \"started_at\" : \"2021-12-20T08:45:53.458919345Z\" , \"tasks\" : [{ \"finished_at\" : \"2021-12-20T08:56:16.438062340Z\" , \"id\" : 0 , \"started_at\" : \"2021-12-20T08:56:16.437123396Z\" , \"state\" : \"finished\" , \"worker\" : 1 , \"cwd\" : \"/tmp/foo\" , \"stderr\" : { \"File\" : \"job-1/0.stderr\" }, \"stdout\" : { \"File\" : \"job-1/0.stdout\" } }], \"time_limit\" : null , \"submit_dir\" : \"/foo/bar/submit\" } Automatic allocation queue list: hq alloc list Example [{ \"additional_args\" : [], \"backlog\" : 4 , \"id\" : 1 , \"manager\" : \"PBS\" , \"max_worker_count\" : null , \"name\" : null , \"timelimit\" : 1800.0 , \"worker_cpu_args\" : null , \"worker_resource_args\" : [], \"workers_per_alloc\" : 1 }] Automatic allocation queue info: hq alloc info Example [{ \"id\" : \"pbs-1\" , \"worker_count\" : 4 , \"queue_at\" : \"2021-12-20T08:56:16.437123396Z\" , \"started_at\" : \"2021-12-20T08:58:25.538001256Z\" , \"ended_at\" : null , \"status\" : \"running\" , \"workdir\" : \"/foo/bar\" }] Automatic allocation queue events: hq alloc events Example [{ \"date\" : \"2021-12-20T08:56:16.437123396Z\" , \"event\" : \"allocation-finished\" , \"params\" : { \"id\" : \"pbs-1\" } }, { \"date\" : \"2021-12-20T08:58:16.437123396Z\" , \"event\" : \"status-fail\" , \"params\" : { \"error\" : \"qstat failed\" } }]","title":"Supported commands"},{"location":"cli/output-mode/#quiet","text":"The quiet output mode will cause HyperQueue to output only the most important information that should be parseable without any complex parsing logic, e.g. using only Bash scripts.","title":"Quiet"},{"location":"cli/output-mode/#error-handling_1","text":"When an error occurs during the execution of a command, the program will exit with exit code 1 and the error will be printed to the standard error output.","title":"Error handling"},{"location":"cli/output-mode/#supported-commands_1","text":"Submit a job: hq submit Schema Outputs a single line containing the ID of the created job. Example $ hq --output-mode = quiet submit ls 1","title":"Supported commands"},{"location":"cli/shortcuts/","text":"Various HyperQueue CLI command options let you enter some value in a specific syntactical format for convenience. Here you can find a list of such shortcuts. ID selector # When you enter (job/task/worker) IDs to various HyperQueue CLI commands, you can use the following selectors to select multiple IDs at once or to reference the most recently created ID: Single ID hq worker stop 1 - stop a worker with ID 1 hq job cancel 5 - cancel a job with ID 5 -: Inclusive range of IDs, starting at start and ending at end with step step hq submit --array=1-10 - create a task array with 10 tasks hq worker stop 1-3 - stop workers with IDs 1 , 2 and 3 hq job cancel 2-10:2 - cancel jobs with IDs 2 , 4 , 6 , 8 and 10 all All valid IDs hq worker stop all - stop all workers hq job cancel all - cancel all jobs last The most recently created ID hq worker stop last - stop most recently connected worker hq job cancel last - cancel most recently submitted job You can also combine the first two types of selectors with a comma. For example, the command $ hq worker stop 1,3,5-8 would stop workers with IDs 1 , 3 , 5 , 6 , 7 and 8 . Tip You can add underscore ( _ ) separators to any of the entered numeric values to improve readability: $ hq submit --array = 1 -1000_000 ... 
Supported commands and options # hq submit --array= hq worker stop hq job info does not support all (use hq job list instead) hq job cancel hq job wait hq job progress Duration # You can enter durations using various time suffixes, for example: 1h - one hour 3m - three minutes 14s - fourteen seconds 15days 2min 2s - fifteen days, two minutes and two seconds You can also combine these suffixed values together by separating them with a space. The full specification of allowed suffixed can be found here . Supported commands and options # hq worker start --time-limit= hq worker start --idle-timeout= hq alloc add pbs --time-limit= hq submit --time-limit= ... hq submit --time-request= ... Tip For increased compatibility with PBS and Slurm , you can also specify the --time-limit option of hq alloc add using the HH:MM:SS format.","title":"Shortcuts"},{"location":"cli/shortcuts/#id-selector","text":"When you enter (job/task/worker) IDs to various HyperQueue CLI commands, you can use the following selectors to select multiple IDs at once or to reference the most recently created ID: Single ID hq worker stop 1 - stop a worker with ID 1 hq job cancel 5 - cancel a job with ID 5 -: Inclusive range of IDs, starting at start and ending at end with step step hq submit --array=1-10 - create a task array with 10 tasks hq worker stop 1-3 - stop workers with IDs 1 , 2 and 3 hq job cancel 2-10:2 - cancel jobs with IDs 2 , 4 , 6 , 8 and 10 all All valid IDs hq worker stop all - stop all workers hq job cancel all - cancel all jobs last The most recently created ID hq worker stop last - stop most recently connected worker hq job cancel last - cancel most recently submitted job You can also combine the first two types of selectors with a comma. For example, the command $ hq worker stop 1,3,5-8 would stop workers with IDs 1 , 3 , 5 , 6 , 7 and 8 . Tip You can add underscore ( _ ) separators to any of the entered numeric values to improve readability: $ hq submit --array = 1 -1000_000 ...","title":"ID selector"},{"location":"cli/shortcuts/#supported-commands-and-options","text":"hq submit --array= hq worker stop hq job info does not support all (use hq job list instead) hq job cancel hq job wait hq job progress ","title":"Supported commands and options"},{"location":"cli/shortcuts/#duration","text":"You can enter durations using various time suffixes, for example: 1h - one hour 3m - three minutes 14s - fourteen seconds 15days 2min 2s - fifteen days, two minutes and two seconds You can also combine these suffixed values together by separating them with a space. The full specification of allowed suffixed can be found here .","title":"Duration"},{"location":"cli/shortcuts/#supported-commands-and-options_1","text":"hq worker start --time-limit= hq worker start --idle-timeout= hq alloc add pbs --time-limit= hq submit --time-limit= ... hq submit --time-request= ... Tip For increased compatibility with PBS and Slurm , you can also specify the --time-limit option of hq alloc add using the HH:MM:SS format.","title":"Supported commands and options"},{"location":"deployment/","text":"Architecture # HyperQueue has two runtime components: Server : a long-lived component which can run e.g. on a login node of a computing cluster. It handles task submitted by the user, manages and asks for HPC resources (PBS/Slurm jobs) and distributes tasks to available workers. Worker : runs on a computing node and actually executes submitted tasks. Server and the workers communicate over encrypted TCP/IP channels. 
The server may run on any machine, as long as the workers are able to connect to it over TCP/IP. Connecting in the other direction (from the server machine to the worker nodes) is not required. A common use-case is to start the server on a login of an HPC system. Learn more about deploying server and the workers .","title":"Architecture"},{"location":"deployment/#architecture","text":"HyperQueue has two runtime components: Server : a long-lived component which can run e.g. on a login node of a computing cluster. It handles task submitted by the user, manages and asks for HPC resources (PBS/Slurm jobs) and distributes tasks to available workers. Worker : runs on a computing node and actually executes submitted tasks. Server and the workers communicate over encrypted TCP/IP channels. The server may run on any machine, as long as the workers are able to connect to it over TCP/IP. Connecting in the other direction (from the server machine to the worker nodes) is not required. A common use-case is to start the server on a login of an HPC system. Learn more about deploying server and the workers .","title":"Architecture"},{"location":"deployment/allocation/","text":"Automatic allocation is one of the core features of HyperQueue. When you run HyperQueue on an HPC cluster, it allows you to autonomously ask the job manager (PBS/Slurm) for computing resources and spawn HyperQueue workers on the provided nodes. Using this mechanism, you can submit computations into HyperQueue without caring about the underlying PBS/Slurm jobs. Job terminology It is common to use the term \"job\" for jobs created by an HPC job manager, such as PBS or Slurm, which are used to perform computations on HPC clusters. However, HyperQueue also uses the term \"job\" for ensembles of tasks . To differentiate between these two, we will refer to jobs created by PBS or Slurm as allocations . We will also refer to PBS/Slurm as a job manager . Allocation queue # To enable automatic allocation, you have to create an Allocation queue . It describes a specific configuration that will be used by HyperQueue to request computing resources from the job manager on your behalf. Each allocation queue has a set of parameters . You can use them to modify the behavior of automatic allocation, but for start you can simply use the defaults. However, you will almost certainly need to specify some credentials to be able to ask for computing resources using PBS/Slurm. To create a new allocation queue, use the following command and pass any required credentials (queue/partition name, account ID, etc.) after -- . These trailing arguments will then be passed directly to qsub / sbatch : PBS Slurm $ hq alloc add pbs --time-limit 1h -- -qqprod -AAccount1 $ hq alloc add slurm --time-limit 1h -- --partition = p1 Tip Make sure that a HyperQueue server is running when you execute this command. Allocation queues are not persistent, so you have to set them up each time you (re)start the server. Warning Do not pass the number of nodes that should be allocated or the allocation walltime using these trailing arguments. These parameters are configured using other means, see below . Once the queue is created, HyperQueue will start asking for allocations in order to provide computing resources (HyperQueue workers). The exact behavior of the automatic allocation process is described below . You can create multiple allocation queues, and you can even combine PBS queues with Slurm queues. 
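For example, a single server can keep a PBS queue and a Slurm queue active at the same time. The following is only a sketch that reuses the queue/partition names from the examples above (substitute your own credentials); the optional --name labels are just for easier identification:
$ hq alloc add pbs --time-limit 1h --name pbs-prod -- -qqprod -AAccount1
$ hq alloc add slurm --time-limit 1h --name slurm-p1 -- --partition=p1
$ hq alloc list   # both queues should now be listed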
Warning Note that the HQ server needs to have access to qsub or sbatch binaries on the node where it is executed. If you want to submit PBS/Slurm allocations on a remote cluster, you will need to use e.g. a proxy to redirect the commands to that cluster. See this issue for more information. If you have a use-case for such remote PBS/Slurm allocation submission, please let us know , as we could try to make that easier in HyperQueue if there was enough interest in it. Parameters # In addition to arguments that are passed to qsub / sbatch , you can also use several other command line options when creating a new allocation queue: Time limit # Format 1 : --time-limit Sets the walltime of created allocations. This parameter is required , as HyperQueue must know the duration of the individual allocations. Make sure that you pass a time limit that does not exceed the limit of the PBS/Slurm queue that you intend to use, otherwise the allocation submissions will fail. You can use the dry-run command to debug this. Workers in this allocation queue will be by default created with a time limit equal to the time limit of the queue (unless overridden with Worker time limit ). Important If you specify a time request for a task, you should be aware that the time limit for the allocation queue should be larger than the time request if you want to run this task on workers created by this allocations queue, because it will always take some time before a worker is fully initialized. For example, if you set --time-request 1h when submitting a task, and --time-limit 1h when creating an allocation queue, this task will never get scheduled on workers from this queue. Backlog # Format: --backlog How many allocations should be queued (waiting to be started) in PBS/Slurm at any given time. Has to be a positive integer. Workers per allocation # Format: --workers-per-alloc How many workers should be requested in each allocation. This corresponds to the number of requested nodes, as the allocator will always create a single worker per node. Max worker count # Format: --max-worker-count Maximum number of workers that can be queued or running in the allocation queue. The total amount of workers will be usually limited by the manager (PBS/Slurm), but you can use this parameter to make the limit smaller, for example if you also want to create manager allocations outside HyperQueue. Worker resources # You can specify CPU and generic resources of workers spawned by the allocation queue. The name and syntax of these parameters is the same as when you create a worker manually : PBS Slurm $ hq alloc add pbs --time-limit 1h --cpus 4x4 --resource \"gpus/nvidia=range(1-2)\" -- -qqprod -AAccount1 $ hq alloc add slurm --time-limit 1h --cpus 4x4 --resource \"gpus/nvidia=range(1-2)\" -- --partition = p1 If you do not pass any resources, they will be detected automatically (same as it works with hq worker start ). Idle timeout # Format 1 : --idle-timeout Sets the idle timeout for workers started by the allocation queue. We suggest that you do not use a long duration for this parameter, as it can result in wasting precious allocation time. Worker start command # Format: --worker-start-cmd Specifies a shell command that will be executed on each allocated node just before a worker is started on that node. You can use it e.g. to initialize some shared environment for the node, or to load software modules. 
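For instance, you could load site-specific software modules before each worker starts. A sketch only; the OpenMPI module name is a placeholder for whatever your cluster actually provides:
$ hq alloc add slurm --time-limit 1h --worker-start-cmd \"module load OpenMPI\" -- --partition=p1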
Worker stop command # Format: --worker-stop-cmd Specifies a shell command that will be executed on each allocated node just after the worker stops on that node. You can use it e.g. to clean up a previously initialized environment for the node. Warning The execution of this command is best-effort! It is not guaranteed that the command will always be executed. For example, PBS/Slurm can kill the allocation without giving HQ a chance to run the command. Worker time limit # Format 1 : --worker-time-limit Sets the time limit of workers spawned by the allocation queue. After the time limit expires, the worker will be stopped. By default, the worker time limit is set to the time limit of the allocation queue. But if you want, you can shorten it with this flag to make the worker exit sooner, for example to give more time for a worker stop command to execute. Note This command is not designed to stop workers early if they have nothing to do. This functionality is provided by idle timeout . Name # Format: --name Name of the allocation queue. It will be used to name allocations submitted to the job manager. Serves for debug purposes only. Behavior # The automatic allocator will submit allocations to make sure that there are is a specific number of allocations waiting to be started by the job manager. This number is called backlog and you can set it when creating the queue. For example, if backlog was set to 4 and there is currently only one allocation queued into the job manager, the allocator would queue three more allocations. The backlog serves to pre-queue allocations, because it can take some time before the job manager starts them, and also as a load balancing factor, since it will allocate as many resources as the job manager allows. Note The backlog value does not limit the number of running allocations, only the number of queued allocations. Warning Do not set the backlog to a large number to avoid overloading the job manager. When an allocation starts, a HyperQueue worker will start and connect to the HyperQueue server that queued the allocation. The worker has the idle timeout set to five minutes, therefore it will terminate if it doesn't receive any new tasks for five minutes. Stopping automatic allocation # If you want to remove an allocation queue, use the following command: $ hq alloc remove When an allocation queue is removed, all its corresponding queued and running allocations will be canceled immediately. By default, HQ will not allow you to remove an allocation queue that contains a running allocation. If you want to force its removal, use the --force flag. When the HQ server stops, it will automatically remove all allocation queues and cleanup all allocations. Debugging automatic allocation # Since the automatic allocator is a \"background\" process that interacts with an external job manager, it can be challenging to debug its behavior. To aid with this process, HyperQueue provides a \"dry-run\" command that you can use to test allocation parameters. HyperQueue also provides various sources of information that can help you find out what is going on. To mitigate the case of incorrectly entered allocation parameters, HQ will also try to submit a test allocation (do a \"dry run\") into the target HPC job manager when you add a new allocation queue. If the test allocation fails, the queue will not be created. You can avoid this behaviour by passing the --no-dry-run flag to hq alloc add . There are also additional safety limits. 
If 10 allocations in a succession fail to be submitted, or if 3 allocations that were submitted fail during runtime in a succession, the corresponding allocation queue will be automatically removed. Dry-run command # To test whether PBS/Slurm will accept the submit parameters that you provide to the auto allocator without creating an allocation queue, you can use the dry-run command. It accepts the same parameters as hq alloc add , which it will use to immediately submit an allocation and print any encountered errors. $ hq alloc dry-run pbs --timelimit 2h -- q qexp -A Project1 If the allocation was submitted successfully, it will be canceled immediately to avoid wasting resources. Finding information about allocations # Basic queue information This command will show you details about allocations created by the automatic allocator. Extended logging To get more information about what is happening inside the allocator, start the HyperQueue server with the following environment variable: $ RUST_LOG = hyperqueue::server::autoalloc = debug hq server start The log output of the server will then contain a detailed trace of allocator actions. Allocation files Each time the allocator queues an allocation into the job manager, it will write the submitted bash script, allocation ID and stdout and stderr of the allocation to disk. You can find these files inside the server directory: $ ls /hq-current/autoalloc/// stderr stdout job-id hq-submit.sh Useful autoalloc commands # Here is a list of useful commands to manage automatic allocation: Display a list of all allocation queues # $ hq alloc list Display information about an allocation queue # $ hq alloc info You can filter allocations by their state ( queued , running , finished , failed ) using the --filter option. You can use various shortcuts for the duration value. \u21a9 \u21a9 \u21a9","title":"Automatic Allocation"},{"location":"deployment/allocation/#allocation-queue","text":"To enable automatic allocation, you have to create an Allocation queue . It describes a specific configuration that will be used by HyperQueue to request computing resources from the job manager on your behalf. Each allocation queue has a set of parameters . You can use them to modify the behavior of automatic allocation, but for start you can simply use the defaults. However, you will almost certainly need to specify some credentials to be able to ask for computing resources using PBS/Slurm. To create a new allocation queue, use the following command and pass any required credentials (queue/partition name, account ID, etc.) after -- . These trailing arguments will then be passed directly to qsub / sbatch : PBS Slurm $ hq alloc add pbs --time-limit 1h -- -qqprod -AAccount1 $ hq alloc add slurm --time-limit 1h -- --partition = p1 Tip Make sure that a HyperQueue server is running when you execute this command. Allocation queues are not persistent, so you have to set them up each time you (re)start the server. Warning Do not pass the number of nodes that should be allocated or the allocation walltime using these trailing arguments. These parameters are configured using other means, see below . Once the queue is created, HyperQueue will start asking for allocations in order to provide computing resources (HyperQueue workers). The exact behavior of the automatic allocation process is described below . You can create multiple allocation queues, and you can even combine PBS queues with Slurm queues. 
Warning Note that the HQ server needs to have access to qsub or sbatch binaries on the node where it is executed. If you want to submit PBS/Slurm allocations on a remote cluster, you will need to use e.g. a proxy to redirect the commands to that cluster. See this issue for more information. If you have a use-case for such remote PBS/Slurm allocation submission, please let us know , as we could try to make that easier in HyperQueue if there was enough interest in it.","title":"Allocation queue"},{"location":"deployment/allocation/#parameters","text":"In addition to arguments that are passed to qsub / sbatch , you can also use several other command line options when creating a new allocation queue:","title":"Parameters"},{"location":"deployment/allocation/#time-limit","text":"Format 1 : --time-limit Sets the walltime of created allocations. This parameter is required , as HyperQueue must know the duration of the individual allocations. Make sure that you pass a time limit that does not exceed the limit of the PBS/Slurm queue that you intend to use, otherwise the allocation submissions will fail. You can use the dry-run command to debug this. Workers in this allocation queue will be by default created with a time limit equal to the time limit of the queue (unless overridden with Worker time limit ). Important If you specify a time request for a task, you should be aware that the time limit for the allocation queue should be larger than the time request if you want to run this task on workers created by this allocations queue, because it will always take some time before a worker is fully initialized. For example, if you set --time-request 1h when submitting a task, and --time-limit 1h when creating an allocation queue, this task will never get scheduled on workers from this queue.","title":"Time limit"},{"location":"deployment/allocation/#backlog","text":"Format: --backlog How many allocations should be queued (waiting to be started) in PBS/Slurm at any given time. Has to be a positive integer.","title":"Backlog"},{"location":"deployment/allocation/#workers-per-allocation","text":"Format: --workers-per-alloc How many workers should be requested in each allocation. This corresponds to the number of requested nodes, as the allocator will always create a single worker per node.","title":"Workers per allocation"},{"location":"deployment/allocation/#max-worker-count","text":"Format: --max-worker-count Maximum number of workers that can be queued or running in the allocation queue. The total amount of workers will be usually limited by the manager (PBS/Slurm), but you can use this parameter to make the limit smaller, for example if you also want to create manager allocations outside HyperQueue.","title":"Max worker count"},{"location":"deployment/allocation/#worker-resources","text":"You can specify CPU and generic resources of workers spawned by the allocation queue. The name and syntax of these parameters is the same as when you create a worker manually : PBS Slurm $ hq alloc add pbs --time-limit 1h --cpus 4x4 --resource \"gpus/nvidia=range(1-2)\" -- -qqprod -AAccount1 $ hq alloc add slurm --time-limit 1h --cpus 4x4 --resource \"gpus/nvidia=range(1-2)\" -- --partition = p1 If you do not pass any resources, they will be detected automatically (same as it works with hq worker start ).","title":"Worker resources"},{"location":"deployment/allocation/#idle-timeout","text":"Format 1 : --idle-timeout Sets the idle timeout for workers started by the allocation queue. 
We suggest that you do not use a long duration for this parameter, as it can result in wasting precious allocation time.","title":"Idle timeout"},{"location":"deployment/allocation/#worker-start-command","text":"Format: --worker-start-cmd Specifies a shell command that will be executed on each allocated node just before a worker is started on that node. You can use it e.g. to initialize some shared environment for the node, or to load software modules.","title":"Worker start command"},{"location":"deployment/allocation/#worker-stop-command","text":"Format: --worker-stop-cmd Specifies a shell command that will be executed on each allocated node just after the worker stops on that node. You can use it e.g. to clean up a previously initialized environment for the node. Warning The execution of this command is best-effort! It is not guaranteed that the command will always be executed. For example, PBS/Slurm can kill the allocation without giving HQ a chance to run the command.","title":"Worker stop command"},{"location":"deployment/allocation/#worker-time-limit","text":"Format 1 : --worker-time-limit Sets the time limit of workers spawned by the allocation queue. After the time limit expires, the worker will be stopped. By default, the worker time limit is set to the time limit of the allocation queue. But if you want, you can shorten it with this flag to make the worker exit sooner, for example to give more time for a worker stop command to execute. Note This command is not designed to stop workers early if they have nothing to do. This functionality is provided by idle timeout .","title":"Worker time limit"},{"location":"deployment/allocation/#name","text":"Format: --name Name of the allocation queue. It will be used to name allocations submitted to the job manager. Serves for debug purposes only.","title":"Name"},{"location":"deployment/allocation/#behavior","text":"The automatic allocator will submit allocations to make sure that there are is a specific number of allocations waiting to be started by the job manager. This number is called backlog and you can set it when creating the queue. For example, if backlog was set to 4 and there is currently only one allocation queued into the job manager, the allocator would queue three more allocations. The backlog serves to pre-queue allocations, because it can take some time before the job manager starts them, and also as a load balancing factor, since it will allocate as many resources as the job manager allows. Note The backlog value does not limit the number of running allocations, only the number of queued allocations. Warning Do not set the backlog to a large number to avoid overloading the job manager. When an allocation starts, a HyperQueue worker will start and connect to the HyperQueue server that queued the allocation. The worker has the idle timeout set to five minutes, therefore it will terminate if it doesn't receive any new tasks for five minutes.","title":"Behavior"},{"location":"deployment/allocation/#stopping-automatic-allocation","text":"If you want to remove an allocation queue, use the following command: $ hq alloc remove When an allocation queue is removed, all its corresponding queued and running allocations will be canceled immediately. By default, HQ will not allow you to remove an allocation queue that contains a running allocation. If you want to force its removal, use the --force flag. 
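For example, assuming the queue to be removed has ID 1 (the ID can be found with hq alloc list ), a forced removal could look like this:
$ hq alloc list              # find the ID of the allocation queue
$ hq alloc remove 1 --force  # remove it even if it has a running allocation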
When the HQ server stops, it will automatically remove all allocation queues and cleanup all allocations.","title":"Stopping automatic allocation"},{"location":"deployment/allocation/#debugging-automatic-allocation","text":"Since the automatic allocator is a \"background\" process that interacts with an external job manager, it can be challenging to debug its behavior. To aid with this process, HyperQueue provides a \"dry-run\" command that you can use to test allocation parameters. HyperQueue also provides various sources of information that can help you find out what is going on. To mitigate the case of incorrectly entered allocation parameters, HQ will also try to submit a test allocation (do a \"dry run\") into the target HPC job manager when you add a new allocation queue. If the test allocation fails, the queue will not be created. You can avoid this behaviour by passing the --no-dry-run flag to hq alloc add . There are also additional safety limits. If 10 allocations in a succession fail to be submitted, or if 3 allocations that were submitted fail during runtime in a succession, the corresponding allocation queue will be automatically removed.","title":"Debugging automatic allocation"},{"location":"deployment/allocation/#dry-run-command","text":"To test whether PBS/Slurm will accept the submit parameters that you provide to the auto allocator without creating an allocation queue, you can use the dry-run command. It accepts the same parameters as hq alloc add , which it will use to immediately submit an allocation and print any encountered errors. $ hq alloc dry-run pbs --timelimit 2h -- q qexp -A Project1 If the allocation was submitted successfully, it will be canceled immediately to avoid wasting resources.","title":"Dry-run command"},{"location":"deployment/allocation/#finding-information-about-allocations","text":"Basic queue information This command will show you details about allocations created by the automatic allocator. Extended logging To get more information about what is happening inside the allocator, start the HyperQueue server with the following environment variable: $ RUST_LOG = hyperqueue::server::autoalloc = debug hq server start The log output of the server will then contain a detailed trace of allocator actions. Allocation files Each time the allocator queues an allocation into the job manager, it will write the submitted bash script, allocation ID and stdout and stderr of the allocation to disk. You can find these files inside the server directory: $ ls /hq-current/autoalloc/// stderr stdout job-id hq-submit.sh","title":"Finding information about allocations"},{"location":"deployment/allocation/#useful-autoalloc-commands","text":"Here is a list of useful commands to manage automatic allocation:","title":"Useful autoalloc commands"},{"location":"deployment/allocation/#display-a-list-of-all-allocation-queues","text":"$ hq alloc list","title":"Display a list of all allocation queues"},{"location":"deployment/allocation/#display-information-about-an-allocation-queue","text":"$ hq alloc info You can filter allocations by their state ( queued , running , finished , failed ) using the --filter option. You can use various shortcuts for the duration value. \u21a9 \u21a9 \u21a9","title":"Display information about an allocation queue"},{"location":"deployment/cloud/","text":"Starting HQ without shared file system # On system without shared file system, all what is needed is to distribute access file ( access.json ) to clients and workers. 
This file contains the address and port where the server is running, together with the secret keys. By default, clients and workers search for access.json in $HOME/.hq-server . Generate access file in advance # In many cases, we want to generate an access file in advance, before any server is started; moreover, we do not want to regenerate the secret keys on every server start, because we do not want to redistribute the access file whenever the server is restarted. To solve this, an access file can be generated in advance with the \"generate-access\" command, e.g.: $ hq server generate-access myaccess.json --client-port=6789 --worker-port=1234 This generates myaccess.json , which contains the generated keys and host information. The server can later be started with this configuration as follows: $ hq server start --access-file=myaccess.json Note that the server still generates and manages its \"own\" access.json in the server directory path.
For connecting clients and workers, you can use either myaccess.json or the newly generated access.json ; they are the same. Example of starting a worker from myaccess.json $ mv myaccess.json /mydirectory/access.json $ hq --server-dir=/mydirectory worker start","title":"Generate access file in advance"},{"location":"deployment/cloud/#splitting-access-for-client-and-workers","text":"The access file contains two secret keys and two connection endpoints, one for clients and one for workers. This information can be divided into two separate files, each containing only the information needed by clients or only the information needed by workers. $ hq server generate-access full.json --client-file=client.json --worker-file=worker.json --client-port=6789 --worker-port=1234 This command creates three files: full.json , client.json , worker.json . For starting a client, you can use client.json as access.json ; it just does not contain the information intended for workers. For starting a worker, you can use worker.json as access.json ; it just does not contain the information intended for clients. For starting the server ( hq server start --access-file=... ), you have to use full.json , as it contains all the necessary information.","title":"Splitting access for client and workers"},{"location":"deployment/cloud/#setting-different-server-hostname-for-workers-and-clients","text":"You can use the following command to configure different hostnames under which the server is visible to workers and clients. hq server generate-access full.json --worker-host= --client-host= ...","title":"Setting different server hostname for workers and clients"},{"location":"deployment/server/","text":"The server is a crucial component of HyperQueue which manages workers and jobs . Before running any computations or deploying workers, you must first start the server. Starting the server # The server can be started by running the following command: $ hq server start You can change the hostname under which the server is visible to workers with the --host option: $ hq server start --host = HOST Server directory # When the server is started, it creates a server directory where it stores information needed for submitting jobs and connecting workers . This directory is then used to select a running HyperQueue instance. By default, the server directory will be stored in $HOME/.hq-server . This location may be changed with the option --server-dir= , which is available for all HyperQueue CLI commands. You can run more instances of HyperQueue under the same Unix user, by making them use different server directories. If you use a non-default server directory, make sure to pass the same --server-dir to all HyperQueue commands that should use the selected HyperQueue server: $ hq --server-dir = foo server start & $ hq --server-dir = foo worker start Tip To avoid having to pass the --server-dir parameter to all hq commands separately, you can also pass it through the HQ_SERVER_DIR environment variable, and export it to share it for all commands in the same terminal session: $ export HQ_SERVER_DIR = bar $ hq server start & $ hq worker start & Important When you start the server, it will create a new subdirectory in the server directory, which will store the data of the current running instance. It will also create a symlink hq-current which will point to the currently active subdirectory. Using this approach, you can start a server using the same server directory multiple times without overwriting data of the previous runs. Server directory access Encryption keys are stored in the server directory.
Whoever has access to the server directory may submit jobs, connect workers to the server and decrypt communication between HyperQueue components. By default, the directory is only accessible by the user who started the server. Keeping the server alive # The server is supposed to be a long-lived component. If you shut it down, all workers will disconnect and all computations will be stopped. Therefore, it is important to make sure that the server will stay running e.g. even after you disconnect from a cluster where the server is deployed. For example, if you SSH into a login node of an HPC cluster and then run the server like this: $ hq server start The server will quit when your SSH session ends, because it will receive a SIGHUP signal. You can use established Unix approaches to avoid this behavior, for example prepending the command with nohup or using a terminal multiplexer like tmux . Resuming stopped/crashed server # The server supports resilience, which allows it to restore its state after it is stopped or if it crashes. To enable resilience, you can tell the server to log events into a journal file, using the --journal flag: $ hq server start --journal /path/to/journal If the server is stopped or it crashes, and you use the same command to start the server (using the same journal file path), it will continue from the last point: $ hq server start --journal /path/to/journal This functionality restores the state of jobs and automatic allocation queues. However, it does not restore worker connections; in the current version, new workers have to be connected to the server after it restarts. Warning If the server crashes, the last few seconds of progress may be lost. For example, when a task is finished and the server crashes before the journal is written, then after resuming the server, the task will be not be computed after a server restart. Exporting journal events # If you'd like to programmatically analyze events that are stored in the journal file, you can export them to JSON using the following command: $ hq journal export The events will be read from the provided journal and printed to stdout encoded in JSON, one event per line (this corresponds to line-delimited JSON, i.e. NDJSON ). You can also directly stream events in real-time from the server using the following command: $ hq journal stream Warning The JSON format of the journal events and their definition is currently unstable and can change with a new HyperQueue version. Stopping server # You can stop a running server with the following command: $ hq server stop When a server is stopped, all running jobs and connected workers will be immediately stopped.","title":"Server"},{"location":"deployment/server/#starting-the-server","text":"The server can be started by running the following command: $ hq server start You can change the hostname under which the server is visible to workers with the --host option: $ hq server start --host = HOST","title":"Starting the server"},{"location":"deployment/server/#server-directory","text":"When the server is started, it creates a server directory where it stores information needed for submitting jobs and connecting workers . This directory is then used to select a running HyperQueue instance. By default, the server directory will be stored in $HOME/.hq-server . This location may be changed with the option --server-dir= , which is available for all HyperQueue CLI commands. You can run more instances of HyperQueue under the same Unix user, by making them use different server directories. 
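The options above can be combined into one resilient, long-lived server startup; this is a minimal sketch in which the server directory and journal path are placeholders.
# Keep the server alive after the SSH session ends and record events into a journal
$ export HQ_SERVER_DIR=/projects/myproject/.hq-server
$ nohup hq server start --journal /projects/myproject/hq-journal &
# After a stop or crash, run the same command with the same journal path to resume
$ nohup hq server start --journal /projects/myproject/hq-journal &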
If you use a non-default server directory, make sure to pass the same --server-dir to all HyperQueue commands that should use the selected HyperQueue server: $ hq --server-dir = foo server start & $ hq --server-dir = foo worker start Tip To avoid having to pass the --server-dir parameter to all hq commands separately, you can also pass it through the HQ_SERVER_DIR environment variable, and export it to share it for all commands in the same terminal session: $ export HQ_SERVER_DIR = bar $ hq server start & $ hq worker start & Important When you start the server, it will create a new subdirectory in the server directory, which will store the data of the current running instance. It will also create a symlink hq-current which will point to the currently active subdirectory. Using this approach, you can start a server using the same server directory multiple times without overwriting data of the previous runs. Server directory access Encryption keys are stored in the server directory. Whoever has access to the server directory may submit jobs, connect workers to the server and decrypt communication between HyperQueue components. By default, the directory is only accessible by the user who started the server.","title":"Server directory"},{"location":"deployment/server/#keeping-the-server-alive","text":"The server is supposed to be a long-lived component. If you shut it down, all workers will disconnect and all computations will be stopped. Therefore, it is important to make sure that the server will stay running e.g. even after you disconnect from a cluster where the server is deployed. For example, if you SSH into a login node of an HPC cluster and then run the server like this: $ hq server start The server will quit when your SSH session ends, because it will receive a SIGHUP signal. You can use established Unix approaches to avoid this behavior, for example prepending the command with nohup or using a terminal multiplexer like tmux .","title":"Keeping the server alive"},{"location":"deployment/server/#resuming-stoppedcrashed-server","text":"The server supports resilience, which allows it to restore its state after it is stopped or if it crashes. To enable resilience, you can tell the server to log events into a journal file, using the --journal flag: $ hq server start --journal /path/to/journal If the server is stopped or it crashes, and you use the same command to start the server (using the same journal file path), it will continue from the last point: $ hq server start --journal /path/to/journal This functionality restores the state of jobs and automatic allocation queues. However, it does not restore worker connections; in the current version, new workers have to be connected to the server after it restarts. Warning If the server crashes, the last few seconds of progress may be lost. For example, when a task is finished and the server crashes before the journal is written, then after resuming the server, the task will be not be computed after a server restart.","title":"Resuming stopped/crashed server"},{"location":"deployment/server/#exporting-journal-events","text":"If you'd like to programmatically analyze events that are stored in the journal file, you can export them to JSON using the following command: $ hq journal export The events will be read from the provided journal and printed to stdout encoded in JSON, one event per line (this corresponds to line-delimited JSON, i.e. NDJSON ). 
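As a small illustration of working with exported journal events: because the event format is unstable, only generic line-oriented processing is shown, and the events.ndjson file name is arbitrary.
# Dump all recorded events as NDJSON and count them (one event per line)
$ hq journal export > events.ndjson
$ wc -l events.ndjson
# Or follow events live from the running server
$ hq journal stream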
You can also directly stream events in real-time from the server using the following command: $ hq journal stream Warning The JSON format of the journal events and their definition is currently unstable and can change with a new HyperQueue version.","title":"Exporting journal events"},{"location":"deployment/server/#stopping-server","text":"You can stop a running server with the following command: $ hq server stop When a server is stopped, all running jobs and connected workers will be immediately stopped.","title":"Stopping server"},{"location":"deployment/worker/","text":"Workers connect to a running instance of a HyperQueue server and wait for task assignments. Once some task is assigned to them, they will compute it and notify the server of its completion. Starting workers # Workers should be started on machines that will actually execute the submitted computations, e.g. computing nodes on an HPC cluster. You can either use the automatic allocation system of HyperQueue to start workers as needed, or deploy workers manually. Automatic worker deployment (recommended) # If you are using a job manager (PBS or Slurm) on an HPC cluster, the easiest way of deploying workers is to use Automatic allocation . It is a component of HyperQueue that takes care of submitting PBS/Slurm jobs and spawning HyperQueue workers. Manual worker deployment # If you want to start a worker manually, you can use the following command: $ hq worker start Each worker will be assigned a unique ID that you can use in later commands to query information about the worker or to stop it. By default, the worker will try to connect to a server using the default server directory . If you want to connect to a different server, use the --server-dir option. Sharing the server directory When you start a worker, it will need to read the server directory to find out how to connect to the server. The directory thus has to be accesible both by the server and the worker machines. On HPC clusters, it is common that login nodes and compute nodes use a shared filesystem, so this shouldn't be a problem. However, if a shared filesystem is not available on your cluster, you can just copy the server directory from the server machine to the worker machine and access it from there. The worker machine still has to be able to initiate a TCP/IP connection to the server machine though. Deploying a worker using PBS/Slurm # If you want to manually start a worker using PBS or Slurm, simply use the corresponding submit command ( qsub or sbatch ) and run the hq worker start command inside the allocated job. If you want to start a worker on each allocated node, you can run this command on each node using e.g. mpirun . Example submission script: PBS Slurm #!/bin/bash #PBS -q # Run a worker on the main node //hq worker start --manager pbs # Run a worker on all allocated nodes ml OpenMPI pbsdsh //hq worker start --manager pbs #!/bin/bash #SBATCH --partition # Run a worker on the main node //hq worker start --manager slurm # Run a worker on all allocated nodes ml OpenMPI srun --overlap //hq worker start --manager slurm The worker will try to automatically detect that it is started under a PBS/Slurm job, but you can also explicitly pass the option --manager to tell the worker that it should expect a specific environment. Stopping workers # If you have started a worker manually, and you want to stop it, you can use the hq worker stop command 1 : $ hq worker stop Time limit # HyperQueue workers are designed to be volatile, i.e. 
it is expected that they will be stopped from time to time, because they are often started inside PBS/Slurm jobs that have a limited duration. It is very useful for the workers to know how much remaining time (\"lifetime\") do they have until they will be stopped. This duration is called the Worker time limit . When a worker is started manually inside a PBS or Slurm job, it will automatically calculate the time limit from the job's metadata. If you want to set time limit for workers started outside of PBS/Slurm jobs or if you want to override the detected settings, you can use the --time-limit= option 2 when starting the worker. When the time limit is reached, the worker is automatically terminated. The time limit of a worker affects what tasks can be scheduled to it. For example, a task submitted with --time-request 10m will not be scheduled onto a worker that only has a remaining time limit of 5 minutes. Idle timeout # When you deploy HQ workers inside a PBS or Slurm job, keeping the worker alive will drain resources from your accounting project (unless you use a free queue). If a worker has nothing to do, it might be better to terminate it sooner to avoid paying these costs for no reason. You can achieve this using Worker idle timeout . If you use it, the worker will automatically stop if it receives no task to compute for the specified duration. For example, if you set the idle duration to five minutes, the worker will stop once it hadn't received any task to compute for five minutes. You can set the idle timeout using the --idle-timeout option 2 when starting the worker. Tip Workers started automatically have the idle timeout set to five minutes. Idle timeout can also be configured globally for all workers using the --idle-timeout option when starting a server: $ hq server start --idle-timeout = This value will be then used for each worker that does not explicitly specify its own idle timeout. Worker state # Each worker can be in one of the following states: Running Worker is running and is able to process tasks Connection lost Worker lost connection to the server. Probably someone manually killed the worker or job walltime in its PBS/Slurm job was reached . Heartbeat lost Communication between server and worker was interrupted. It usually signifies a network problem or a hardware crash of the computational node. Stopped Worker was stopped . Idle timeout Worker was terminated due to Idle timeout . Lost connection to the server # The behavior of what should happen with a worker that lost its connection to the server is configured via hq worker start --on-server-lost= . You can select from two policies: stop - The worker immediately terminates and kills all currently running tasks. finish-running - The worker does not start to execute any new tasks, but it tries to finish tasks that are already running. When all such tasks finish, the worker will terminate. stop is the default policy when a worker is manually started by hq worker start . When a worker is started by the automatic allocator , then finish-running is used as the default value. Useful worker commands # Here is a list of useful worker commands: Display worker list # This command will display a list of workers that are currently connected to the server: $ hq worker list If you also want to include workers that are offline (i.e. that have crashed or disconnected in the past), pass the --all flag to the list command. 
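The worker options above can be combined in a single submission script. This is a sketch assuming a Slurm cluster; the partition name is a placeholder and the 5m duration string follows the duration shortcuts mentioned in the footnotes.
#!/bin/bash
#SBATCH --partition <partition-name>
# Stop the worker after five idle minutes to avoid draining the allocation,
# and let already-running tasks finish if the connection to the server is lost
srun --overlap hq worker start --manager slurm --idle-timeout=5m --on-server-lost=finish-running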
Display information about a specific worker # $ hq worker info Worker groups # Each worker is a member exactly of one group. Groups are used when multi-node tasks are used. See more here You can use various shortcuts to select multiple workers at once. \u21a9 You can use various shortcuts for the duration value. \u21a9 \u21a9","title":"Workers"},{"location":"deployment/worker/#starting-workers","text":"Workers should be started on machines that will actually execute the submitted computations, e.g. computing nodes on an HPC cluster. You can either use the automatic allocation system of HyperQueue to start workers as needed, or deploy workers manually.","title":"Starting workers"},{"location":"deployment/worker/#automatic-worker-deployment-recommended","text":"If you are using a job manager (PBS or Slurm) on an HPC cluster, the easiest way of deploying workers is to use Automatic allocation . It is a component of HyperQueue that takes care of submitting PBS/Slurm jobs and spawning HyperQueue workers.","title":"Automatic worker deployment (recommended)"},{"location":"deployment/worker/#manual-worker-deployment","text":"If you want to start a worker manually, you can use the following command: $ hq worker start Each worker will be assigned a unique ID that you can use in later commands to query information about the worker or to stop it. By default, the worker will try to connect to a server using the default server directory . If you want to connect to a different server, use the --server-dir option. Sharing the server directory When you start a worker, it will need to read the server directory to find out how to connect to the server. The directory thus has to be accesible both by the server and the worker machines. On HPC clusters, it is common that login nodes and compute nodes use a shared filesystem, so this shouldn't be a problem. However, if a shared filesystem is not available on your cluster, you can just copy the server directory from the server machine to the worker machine and access it from there. The worker machine still has to be able to initiate a TCP/IP connection to the server machine though.","title":"Manual worker deployment"},{"location":"deployment/worker/#deploying-a-worker-using-pbsslurm","text":"If you want to manually start a worker using PBS or Slurm, simply use the corresponding submit command ( qsub or sbatch ) and run the hq worker start command inside the allocated job. If you want to start a worker on each allocated node, you can run this command on each node using e.g. mpirun . Example submission script: PBS Slurm #!/bin/bash #PBS -q # Run a worker on the main node //hq worker start --manager pbs # Run a worker on all allocated nodes ml OpenMPI pbsdsh //hq worker start --manager pbs #!/bin/bash #SBATCH --partition # Run a worker on the main node //hq worker start --manager slurm # Run a worker on all allocated nodes ml OpenMPI srun --overlap //hq worker start --manager slurm The worker will try to automatically detect that it is started under a PBS/Slurm job, but you can also explicitly pass the option --manager to tell the worker that it should expect a specific environment.","title":"Deploying a worker using PBS/Slurm"},{"location":"deployment/worker/#stopping-workers","text":"If you have started a worker manually, and you want to stop it, you can use the hq worker stop command 1 : $ hq worker stop ","title":"Stopping workers"},{"location":"deployment/worker/#time-limit","text":"HyperQueue workers are designed to be volatile, i.e. 
it is expected that they will be stopped from time to time, because they are often started inside PBS/Slurm jobs that have a limited duration. It is very useful for the workers to know how much remaining time (\"lifetime\") do they have until they will be stopped. This duration is called the Worker time limit . When a worker is started manually inside a PBS or Slurm job, it will automatically calculate the time limit from the job's metadata. If you want to set time limit for workers started outside of PBS/Slurm jobs or if you want to override the detected settings, you can use the --time-limit= option 2 when starting the worker. When the time limit is reached, the worker is automatically terminated. The time limit of a worker affects what tasks can be scheduled to it. For example, a task submitted with --time-request 10m will not be scheduled onto a worker that only has a remaining time limit of 5 minutes.","title":"Time limit"},{"location":"deployment/worker/#idle-timeout","text":"When you deploy HQ workers inside a PBS or Slurm job, keeping the worker alive will drain resources from your accounting project (unless you use a free queue). If a worker has nothing to do, it might be better to terminate it sooner to avoid paying these costs for no reason. You can achieve this using Worker idle timeout . If you use it, the worker will automatically stop if it receives no task to compute for the specified duration. For example, if you set the idle duration to five minutes, the worker will stop once it hadn't received any task to compute for five minutes. You can set the idle timeout using the --idle-timeout option 2 when starting the worker. Tip Workers started automatically have the idle timeout set to five minutes. Idle timeout can also be configured globally for all workers using the --idle-timeout option when starting a server: $ hq server start --idle-timeout = This value will be then used for each worker that does not explicitly specify its own idle timeout.","title":"Idle timeout"},{"location":"deployment/worker/#worker-state","text":"Each worker can be in one of the following states: Running Worker is running and is able to process tasks Connection lost Worker lost connection to the server. Probably someone manually killed the worker or job walltime in its PBS/Slurm job was reached . Heartbeat lost Communication between server and worker was interrupted. It usually signifies a network problem or a hardware crash of the computational node. Stopped Worker was stopped . Idle timeout Worker was terminated due to Idle timeout .","title":"Worker state"},{"location":"deployment/worker/#lost-connection-to-the-server","text":"The behavior of what should happen with a worker that lost its connection to the server is configured via hq worker start --on-server-lost= . You can select from two policies: stop - The worker immediately terminates and kills all currently running tasks. finish-running - The worker does not start to execute any new tasks, but it tries to finish tasks that are already running. When all such tasks finish, the worker will terminate. stop is the default policy when a worker is manually started by hq worker start . 
When a worker is started by the automatic allocator , then finish-running is used as the default value.","title":"Lost connection to the server"},{"location":"deployment/worker/#useful-worker-commands","text":"Here is a list of useful worker commands:","title":"Useful worker commands"},{"location":"deployment/worker/#display-worker-list","text":"This command will display a list of workers that are currently connected to the server: $ hq worker list If you also want to include workers that are offline (i.e. that have crashed or disconnected in the past), pass the --all flag to the list command.","title":"Display worker list"},{"location":"deployment/worker/#display-information-about-a-specific-worker","text":"$ hq worker info ","title":"Display information about a specific worker"},{"location":"deployment/worker/#worker-groups","text":"Each worker is a member exactly of one group. Groups are used when multi-node tasks are used. See more here You can use various shortcuts to select multiple workers at once. \u21a9 You can use various shortcuts for the duration value. \u21a9 \u21a9","title":"Worker groups"},{"location":"examples/","text":"Examples # Here you can find several examples of how HyperQueue can be used for various use-cases, both with the command-line interface and also with the Python API. You can view these examples either in the documentation or on GitHub . Iterative computation","title":"Examples"},{"location":"examples/#examples","text":"Here you can find several examples of how HyperQueue can be used for various use-cases, both with the command-line interface and also with the Python API. You can view these examples either in the documentation or on GitHub . Iterative computation","title":"Examples"},{"location":"examples/iterative-computation/","text":"Iterative computation # It is a common use-case to perform an iterative computation, e.g. run a randomized simulation until the results are stable/accurate enough, or train a machine learning model while the loss keeps dropping. While there is currently no built-in support in HQ for iteratively submitting new tasks to an existing job, you can perform an iterative computation relatively easily with the following approach: Submit a HQ job that performs a computation Wait for the job to finish Read the output of the job and decide if computation should continue If yes, go to 1. Python API # With the Python API, we can simply write the outermost iteration loop in Python, and repeatedly submit jobs, until some end criterion has been achieved: from hyperqueue import Job , Client client = Client () while True : job = Job () job . program ([ \"my-program\" ], stdout = \"out.txt\" ) # Submit a job submitted = client . submit ( job ) # Wait for it to complete client . wait_for_jobs ([ submitted ]) # Read the output of the job with open ( \"out.txt\" ) as f : # Check some termination condition and eventually end the loop if f . read () . strip () == \"done\" : break Command-line interface # With the command-line interface, you can perform the iterative loop e.g. in Bash. #!/bin/bash while : do # Submit a job and wait for it to complete ./hq submit --wait ./compute.sh # Read the output of the job output = $( ./hq job cat last stdout ) # Decide if we should end or continue if [ \" ${ output } \" -eq 0 ] ; then break fi done","title":"Iterative computation"},{"location":"examples/iterative-computation/#iterative-computation","text":"It is a common use-case to perform an iterative computation, e.g. 
run a randomized simulation until the results are stable/accurate enough, or train a machine learning model while the loss keeps dropping. While there is currently no built-in support in HQ for iteratively submitting new tasks to an existing job, you can perform an iterative computation relatively easily with the following approach: Submit a HQ job that performs a computation Wait for the job to finish Read the output of the job and decide if computation should continue If yes, go to 1.","title":"Iterative computation"},{"location":"examples/iterative-computation/#python-api","text":"With the Python API, we can simply write the outermost iteration loop in Python, and repeatedly submit jobs, until some end criterion has been achieved: from hyperqueue import Job , Client client = Client () while True : job = Job () job . program ([ \"my-program\" ], stdout = \"out.txt\" ) # Submit a job submitted = client . submit ( job ) # Wait for it to complete client . wait_for_jobs ([ submitted ]) # Read the output of the job with open ( \"out.txt\" ) as f : # Check some termination condition and eventually end the loop if f . read () . strip () == \"done\" : break","title":"Python API"},{"location":"examples/iterative-computation/#command-line-interface","text":"With the command-line interface, you can perform the iterative loop e.g. in Bash. #!/bin/bash while : do # Submit a job and wait for it to complete ./hq submit --wait ./compute.sh # Read the output of the job output = $( ./hq job cat last stdout ) # Decide if we should end or continue if [ \" ${ output } \" -eq 0 ] ; then break fi done","title":"Command-line interface"},{"location":"jobs/arrays/","text":"It is a common use case to execute the same command for multiple input parameters, for example: Perform a simulation for each input file in a directory or for each line in a CSV file. Train many machine learning models using hyperparameter search for each model configuration. HyperQueue allows you to do this using a job that contains many tasks. We call such jobs Task arrays . You can create a task array with a single submit command and then manage all created tasks as a single group using its containing job. Note Task arrays are somewhat similar to \"job arrays\" used by PBS and Slurm. However, HQ does not use PBS/Slurm job arrays for implementing this feature. Therefore, the limits that are commonly enforced on job arrays on HPC clusters do not apply to HyperQueue task arrays. Creating task arrays # To create a task array, you must provide some source that will determine how many tasks should be created and what inputs (environment variables) should be passed to each task so that you can differentiate them. Currently, you can create a task array from a range of integers , from each line of a text file or from each item of a JSON array . You cannot combine these sources, as they are mutually exclusive. Handling many output files By default, each task in a task array will create two output files (containing stdout and stderr output). Creating large task arrays will thus generate a lot of files, which can be problematic especially on network-based shared filesystems, such as Lustre. To avoid this, you can either disable the output or use Output streaming . Integer range # The simplest way of creating a task array is to specify an integer range. A task will be started for each integer in the range. You can then differentiate between the individual tasks using task id that can be accessed through the HQ_TASK_ID environment variable . 
You can enter the range as two unsigned numbers separated by a dash 1 , where the first number should be smaller than the second one. The range is inclusive. The range is entered using the --array option: # Task array with 3 tasks, with ids 1, 2, 3 $ hq submit --array 1 -3 ... # Task array with 6 tasks, with ids 0, 2, 4, 6, 8, 10 $ hq submit --array 0 -10:2 ... Lines of a file # Another way of creating a task array is to provide a text file with multiple lines. Each line from the file will be passed to a separate task, which can access the value of the line using the environment variable HQ_ENTRY . This is useful if you want to e.g. process each file inside some directory. You can generate a text file that will contain each filepath on a separate line and then pass it to the submit command using the --each-line option: $ hq submit --each-line entries.txt ... Tip To directly use an environment variable in the submitted command, you have to make sure that it will be expanded when the command is executed, not when the command is submitted. You should also execute the command in a bash script if you want to specify it directly and not via a script file. For example, the following command is incorrect , as it will expand HQ_ENTRY during submission (probably to an empty string) and submit a command ls : $ hq submit --each-line files.txt ls $HQ_ENTRY To actually submit the command ls $HQ_ENTRY , you can e.g. wrap the command in apostrophes and run it in a shell: $ hq submit --each-line files.txt bash -c 'ls $HQ_ENTRY' JSON array # You can also specify the source using a JSON array stored inside a file. HyperQueue will then create a task for each item in the array and pass the item as a JSON string to the corresponding task using the environment variable HQ_ENTRY . Note The root JSON value stored inside the file must be an array. You can create a task array in this way using the --from-json option: $ hq submit --from-json items.json ... If items.json contained this content: [{ \"batch_size\" : 4 , \"learning_rate\" : 0.01 }, { \"batch_size\" : 8 , \"learning_rate\" : 0.001 }] then HyperQueue would create two tasks, one with HQ_ENTRY set to {\"batch_size\": 4, \"learning_rate\": 0.01} and the other with HQ_ENTRY set to {\"batch_size\": 8, \"learning_rate\": 0.001} . Combining with --each-line / --from-json with --array # Option --each-line or --from-json can be combined with option --array . In such case, only a subset of lines/json will be submitted. If --array defines an ID that exceeds the number of lines in the file (or the number of elements in JSON), then the ID is silently removed. For example: $ hq submit --each-line input.txt --array \"2, 8-10\" If input.txt has sufficiently many lines then it will create array job with four tasks. One for 3rd line of file and three tasks for 9th-11th line (note that first line has id 0). It analogously works for --from-json . The full syntax can be seen in the second selector of the ID selector shortcut . \u21a9","title":"Task Arrays"},{"location":"jobs/arrays/#creating-task-arrays","text":"To create a task array, you must provide some source that will determine how many tasks should be created and what inputs (environment variables) should be passed to each task so that you can differentiate them. Currently, you can create a task array from a range of integers , from each line of a text file or from each item of a JSON array . You cannot combine these sources, as they are mutually exclusive. 
Handling many output files By default, each task in a task array will create two output files (containing stdout and stderr output). Creating large task arrays will thus generate a lot of files, which can be problematic especially on network-based shared filesystems, such as Lustre. To avoid this, you can either disable the output or use Output streaming .","title":"Creating task arrays"},{"location":"jobs/arrays/#integer-range","text":"The simplest way of creating a task array is to specify an integer range. A task will be started for each integer in the range. You can then differentiate between the individual tasks using task id that can be accessed through the HQ_TASK_ID environment variable . You can enter the range as two unsigned numbers separated by a dash 1 , where the first number should be smaller than the second one. The range is inclusive. The range is entered using the --array option: # Task array with 3 tasks, with ids 1, 2, 3 $ hq submit --array 1 -3 ... # Task array with 6 tasks, with ids 0, 2, 4, 6, 8, 10 $ hq submit --array 0 -10:2 ...","title":"Integer range"},{"location":"jobs/arrays/#lines-of-a-file","text":"Another way of creating a task array is to provide a text file with multiple lines. Each line from the file will be passed to a separate task, which can access the value of the line using the environment variable HQ_ENTRY . This is useful if you want to e.g. process each file inside some directory. You can generate a text file that will contain each filepath on a separate line and then pass it to the submit command using the --each-line option: $ hq submit --each-line entries.txt ... Tip To directly use an environment variable in the submitted command, you have to make sure that it will be expanded when the command is executed, not when the command is submitted. You should also execute the command in a bash script if you want to specify it directly and not via a script file. For example, the following command is incorrect , as it will expand HQ_ENTRY during submission (probably to an empty string) and submit a command ls : $ hq submit --each-line files.txt ls $HQ_ENTRY To actually submit the command ls $HQ_ENTRY , you can e.g. wrap the command in apostrophes and run it in a shell: $ hq submit --each-line files.txt bash -c 'ls $HQ_ENTRY'","title":"Lines of a file"},{"location":"jobs/arrays/#json-array","text":"You can also specify the source using a JSON array stored inside a file. HyperQueue will then create a task for each item in the array and pass the item as a JSON string to the corresponding task using the environment variable HQ_ENTRY . Note The root JSON value stored inside the file must be an array. You can create a task array in this way using the --from-json option: $ hq submit --from-json items.json ... If items.json contained this content: [{ \"batch_size\" : 4 , \"learning_rate\" : 0.01 }, { \"batch_size\" : 8 , \"learning_rate\" : 0.001 }] then HyperQueue would create two tasks, one with HQ_ENTRY set to {\"batch_size\": 4, \"learning_rate\": 0.01} and the other with HQ_ENTRY set to {\"batch_size\": 8, \"learning_rate\": 0.001} .","title":"JSON array"},{"location":"jobs/arrays/#combining-with-each-line-from-json-with-array","text":"Option --each-line or --from-json can be combined with option --array . In such case, only a subset of lines/json will be submitted. If --array defines an ID that exceeds the number of lines in the file (or the number of elements in JSON), then the ID is silently removed. 
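As a concrete sketch of the \"lines of a file\" source described above: generate the entry file and consume HQ_ENTRY in a small shell command. The find pattern and the ./process.sh script are illustrative placeholders.
# Build one input path per line
$ find ./inputs -name '*.csv' > entries.txt
# One task per line; each task sees its own line in $HQ_ENTRY (note the single quotes)
$ hq submit --each-line entries.txt bash -c './process.sh "$HQ_ENTRY"'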
For example: $ hq submit --each-line input.txt --array \"2, 8-10\" If input.txt has sufficiently many lines then it will create array job with four tasks. One for 3rd line of file and three tasks for 9th-11th line (note that first line has id 0). It analogously works for --from-json . The full syntax can be seen in the second selector of the ID selector shortcut . \u21a9","title":"Combining with --each-line/--from-json with --array"},{"location":"jobs/cresources/","text":"CPU resource management # Note In this text, we use the term CPU for a resource that is provided by the operating system (e.g. what you get from /proc/cpuinfo ). In this meaning, it is usually a core of a physical CPU. In the text related to NUMA we use the term socket to refer to physical CPUs. Brief introduction # HyperQueue allows you to select how many CPU cores will be allocated for each task. By default, each task requires a single CPU of the worker's node. This can be changed by the flag --cpus . For example, to submit a job with a task that requires 8 CPUs: $ hq submit --cpus = 8 This ensures that HyperQueue will exclusively reserve 8 CPUs for this task when it is started. This task would thus never be scheduled on a worker that has less than 8 CPUs. Note that this reservation exists on a logical level only. To ensure more direct mapping to physical cores, see pinning below. CPUs are a resource # From version 0.13.0, CPUs are managed as any other resource under name \"cpus\", with the following additions: If a task does not explicitly specify the number of cpus, then it requests 1 CPU as default. CPUs request can be specified by hq submit --cpus=X ... where --cpus=X is a shortcut for --resource cpus=X , and X can be all valid requests for a resource, including values like all or 8 compact! . (More in Resource Management ). A task may be automatically pinned to a given CPUs (see pinning ). There are some extra environmental variables for CPUs (see below). CPUs are automatically detected. See below for information about NUMA or Hyper Threading. CPUs provided by a worker can be explicitly specified via --cpus , see below. CPU related environment variables # The following variables are created when a task is executed: HQ_CPUS - List of cores assigned to a task. (this is an alias for HQ_RESOURCE_VALUES_cpus ). HQ_PIN - Is set to taskset or omp (depending on the used pin mode) if the task was pinned by HyperQueue (see below). NUM_OMP_THREADS -- Set to number of cores assigned for task. (For compatibility with OpenMP). This option is not set when you ask for a non-integer number of CPUs. Pinning # By default, HQ internally allocates CPUs on a logical level. In other words, HQ ensures that the sum of requests of concurrently running tasks does not exceed the number of CPUs of the worker, but process assignment to cores is left to the system scheduler, which may move processes across CPUs as it wants. If this is not desired, especially in the case of NUMA, processes could be pinned, either manually or automatically. Automatic pinning # HyperQueue can pin threads using two ways: with taskset or by setting OpenMP environment variables. You can use the --pin flag to choose between these two modes. 
taskset OpenMP $ hq submit --pin taskset --cpus = 8 will cause HyperQueue to execute your program like this: taskset -c \"\" ` $ hq submit --pin omp --cpus = 8 will cause HyperQueue to execute your program like this: OMP_PROC_BIND = close OMP_PLACES = \"{}\" If any automatic pinning mode is enabled, the environment variable HQ_PIN will be set. Manual pinning # If you want to gain full control over core pinning, you may pin the process by yourself. The assigned CPUs are stored in the environment variable HQ_CPUS as a comma-delimited list of CPU IDs. You can use utilities such as taskset or numactl and pass them HQ_CPUS to pin a process to these CPUs. Warning If you manually pin your processes, do not also use the --pin flag of the submit command. It may have some unwanted interferences. Below you can find an example of a script file that pins the executed process manually using taskset and numactl : taskset numactl #!/bin/bash taskset -c $HQ_CPUS #!/bin/bash numactl -C $HQ_CPUS If you submit this script with hq submit --cpus=4 script.sh , it will pin your program to 4 CPUs allocated by HQ. NUMA allocation strategy # Workers automatically detect the number of CPUs and on Linux systems they also detect their partitioning into sockets. When a NUMA architecture is automatically detected, indexed resource with groups is used for resource \"cpus\". You can then use allocation strategies for groups to specify how sockets are allocated. They follow the same rules as normal allocation strategies; for clarity we are rephrasing the group allocation strategies in terms of cores and sockets: Compact ( compact ) - Tries to allocate cores on as few sockets as possible in the current worker state. $ hq submit --cpus = \"8 compact\" ... Strict Compact ( compact! ) - Always allocates cores on as few sockets as possible for a target node. The task will not be executed until the requirement could be fully fulfilled. For example, if your worker has 4 cores per socket, and you ask for 4 CPUs, it will always be executed on a single socket. If you ask for 8 CPUs, it will always be executed on two sockets. $ hq submit --cpus = \"8 compact!\" ... Tip You might encounter a problem in your shell when you try to specify the strict compact policy, because the definition contains an exclamation mark ( ! ). In that case, try to wrap the policy in single quotes, like this: $ hq submit --cpus = '8 compact!' ... Scatter ( scatter ) - Allocate cores across as many sockets possible, based on the currently available cores of a worker. If your worker has 4 sockets with 8 cores per socket, and you ask for 8 CPUs, then HQ will try to run the process with 2 CPUs on each socket, if possible given the currently available worker cores. $ hq submit --cpus = \"8 scatter\" ... The default policy is the compact policy, i.e. --cpus= is equivalent to --cpus=\" compact\" . Note Specifying a policy only has an effect if you have more than one socket (physical CPUs). In case of a single socket, policies are indistinguishable. CPU configuration # Each worker will automatically detect the number of CPUs available. On Linux systems, it will also detect the partitioning into sockets (NUMA configuration). In most cases, it should work out of the box. If you want to see how will a HQ worker see your CPU configuration without actually starting the worker, you can use the hq worker hwdetect command, which will print the detected CPU configuration. 
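A short sketch of checking and overriding the detected CPU configuration, using only the commands described in this section; the concrete socket layout is an example.
# Inspect what a worker would detect on this node, without starting it
$ hq worker hwdetect
# Override the detection: 2 sockets with 12 cores each
$ hq worker start --cpus=2x12
# Or keep automatic detection but ignore hyper-threads
$ hq worker start --no-hyper-threading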
Manual specification of CPU configuration # If the automatic detection fails for some reason, or you want to manually configure the CPU configuration, you can use the --cpus flag when starting a worker. It is an alias for --resource cpus=... (More in Resource Management ), except it also allow to define --cpus=N where N is an integer; it is then interpreted as 1xN in the resource definition. Below there are some examples of configuration that you can specify: Worker with 8 CPUs and a single socket. $ hq worker start --cpus = 8 Worker with 2 sockets with 12 cores per socket. $ hq worker start --cpus = 2x12 Manually specify that the worker should use the following core ids and how they are organized into sockets. In this example, two sockets are defined, one with 3 cores and one with 2 cores. $ hq worker start --cpus =[[ 2 , 3 , 4 ] , [ 10 , 14 ]] Disable Hyper Threading # If you want to detect CPUs but ignore HyperThreading then --no-hyper-threading flag can be used. It will detect only the first virtual core of each physical core. Example: $ hq worker start --no-hyper-threading","title":"CPU Resources"},{"location":"jobs/cresources/#cpu-resource-management","text":"Note In this text, we use the term CPU for a resource that is provided by the operating system (e.g. what you get from /proc/cpuinfo ). In this meaning, it is usually a core of a physical CPU. In the text related to NUMA we use the term socket to refer to physical CPUs.","title":"CPU resource management"},{"location":"jobs/cresources/#brief-introduction","text":"HyperQueue allows you to select how many CPU cores will be allocated for each task. By default, each task requires a single CPU of the worker's node. This can be changed by the flag --cpus . For example, to submit a job with a task that requires 8 CPUs: $ hq submit --cpus = 8 This ensures that HyperQueue will exclusively reserve 8 CPUs for this task when it is started. This task would thus never be scheduled on a worker that has less than 8 CPUs. Note that this reservation exists on a logical level only. To ensure more direct mapping to physical cores, see pinning below.","title":"Brief introduction"},{"location":"jobs/cresources/#cpus-are-a-resource","text":"From version 0.13.0, CPUs are managed as any other resource under name \"cpus\", with the following additions: If a task does not explicitly specify the number of cpus, then it requests 1 CPU as default. CPUs request can be specified by hq submit --cpus=X ... where --cpus=X is a shortcut for --resource cpus=X , and X can be all valid requests for a resource, including values like all or 8 compact! . (More in Resource Management ). A task may be automatically pinned to a given CPUs (see pinning ). There are some extra environmental variables for CPUs (see below). CPUs are automatically detected. See below for information about NUMA or Hyper Threading. CPUs provided by a worker can be explicitly specified via --cpus , see below.","title":"CPUs are a resource"},{"location":"jobs/cresources/#cpu-related-environment-variables","text":"The following variables are created when a task is executed: HQ_CPUS - List of cores assigned to a task. (this is an alias for HQ_RESOURCE_VALUES_cpus ). HQ_PIN - Is set to taskset or omp (depending on the used pin mode) if the task was pinned by HyperQueue (see below). NUM_OMP_THREADS -- Set to number of cores assigned for task. (For compatibility with OpenMP). 
This option is not set when you ask for a non-integer number of CPUs.","title":"CPU related environment variables"},{"location":"jobs/cresources/#pinning","text":"By default, HQ internally allocates CPUs on a logical level. In other words, HQ ensures that the sum of requests of concurrently running tasks does not exceed the number of CPUs of the worker, but process assignment to cores is left to the system scheduler, which may move processes across CPUs as it wants. If this is not desired, especially in the case of NUMA, processes could be pinned, either manually or automatically.","title":"Pinning"},{"location":"jobs/cresources/#automatic-pinning","text":"HyperQueue can pin threads using two ways: with taskset or by setting OpenMP environment variables. You can use the --pin flag to choose between these two modes. taskset OpenMP $ hq submit --pin taskset --cpus = 8 will cause HyperQueue to execute your program like this: taskset -c \"\" ` $ hq submit --pin omp --cpus = 8 will cause HyperQueue to execute your program like this: OMP_PROC_BIND = close OMP_PLACES = \"{}\" If any automatic pinning mode is enabled, the environment variable HQ_PIN will be set.","title":"Automatic pinning"},{"location":"jobs/cresources/#manual-pinning","text":"If you want to gain full control over core pinning, you may pin the process by yourself. The assigned CPUs are stored in the environment variable HQ_CPUS as a comma-delimited list of CPU IDs. You can use utilities such as taskset or numactl and pass them HQ_CPUS to pin a process to these CPUs. Warning If you manually pin your processes, do not also use the --pin flag of the submit command. It may have some unwanted interferences. Below you can find an example of a script file that pins the executed process manually using taskset and numactl : taskset numactl #!/bin/bash taskset -c $HQ_CPUS #!/bin/bash numactl -C $HQ_CPUS If you submit this script with hq submit --cpus=4 script.sh , it will pin your program to 4 CPUs allocated by HQ.","title":"Manual pinning"},{"location":"jobs/cresources/#numa-allocation-strategy","text":"Workers automatically detect the number of CPUs and on Linux systems they also detect their partitioning into sockets. When a NUMA architecture is automatically detected, indexed resource with groups is used for resource \"cpus\". You can then use allocation strategies for groups to specify how sockets are allocated. They follow the same rules as normal allocation strategies; for clarity we are rephrasing the group allocation strategies in terms of cores and sockets: Compact ( compact ) - Tries to allocate cores on as few sockets as possible in the current worker state. $ hq submit --cpus = \"8 compact\" ... Strict Compact ( compact! ) - Always allocates cores on as few sockets as possible for a target node. The task will not be executed until the requirement could be fully fulfilled. For example, if your worker has 4 cores per socket, and you ask for 4 CPUs, it will always be executed on a single socket. If you ask for 8 CPUs, it will always be executed on two sockets. $ hq submit --cpus = \"8 compact!\" ... Tip You might encounter a problem in your shell when you try to specify the strict compact policy, because the definition contains an exclamation mark ( ! ). In that case, try to wrap the policy in single quotes, like this: $ hq submit --cpus = '8 compact!' ... Scatter ( scatter ) - Allocate cores across as many sockets possible, based on the currently available cores of a worker. 
If your worker has 4 sockets with 8 cores per socket, and you ask for 8 CPUs, then HQ will try to run the process with 2 CPUs on each socket, if possible given the currently available worker cores. $ hq submit --cpus = \"8 scatter\" ... The default policy is the compact policy, i.e. --cpus= is equivalent to --cpus=\" compact\" . Note Specifying a policy only has an effect if you have more than one socket (physical CPUs). In case of a single socket, policies are indistinguishable.","title":"NUMA allocation strategy"},{"location":"jobs/cresources/#cpu-configuration","text":"Each worker will automatically detect the number of CPUs available. On Linux systems, it will also detect the partitioning into sockets (NUMA configuration). In most cases, it should work out of the box. If you want to see how will a HQ worker see your CPU configuration without actually starting the worker, you can use the hq worker hwdetect command, which will print the detected CPU configuration.","title":"CPU configuration"},{"location":"jobs/cresources/#manual-specification-of-cpu-configuration","text":"If the automatic detection fails for some reason, or you want to manually configure the CPU configuration, you can use the --cpus flag when starting a worker. It is an alias for --resource cpus=... (More in Resource Management ), except it also allow to define --cpus=N where N is an integer; it is then interpreted as 1xN in the resource definition. Below there are some examples of configuration that you can specify: Worker with 8 CPUs and a single socket. $ hq worker start --cpus = 8 Worker with 2 sockets with 12 cores per socket. $ hq worker start --cpus = 2x12 Manually specify that the worker should use the following core ids and how they are organized into sockets. In this example, two sockets are defined, one with 3 cores and one with 2 cores. $ hq worker start --cpus =[[ 2 , 3 , 4 ] , [ 10 , 14 ]]","title":"Manual specification of CPU configuration"},{"location":"jobs/cresources/#disable-hyper-threading","text":"If you want to detect CPUs but ignore HyperThreading then --no-hyper-threading flag can be used. It will detect only the first virtual core of each physical core. Example: $ hq worker start --no-hyper-threading","title":"Disable Hyper Threading"},{"location":"jobs/directives/","text":"Directives # You can specify job parameters using special comments ( directives ) specified in a submitted shell script. Directives are lines that begin with the #HQ prefix. Any text following this prefix will be interpreted as a command line argument for hq submit . Example directive file # Suppose that script.sh has the following content: #!/bin/bash #HQ --name=Example #HQ --cpus=\"2 compact\" --pin taskset ./my-program If you execute $ hq submit script.sh it will behave as if you have executed $ hq submit --name = Example --cpus = \"2 compact\" --pin taskset script.sh Directives mode # You can select three modes using the --directives flag of hq submit . The mode will determine when should HyperQueue attempt to parse directives from the provided command. auto (default) - Directives will be parsed if the first command passed to hq submit has the .sh extension. file - Directives will be parsed from the first command passed to hq submit . stdin - Directives will be parsed from stdin (see --stdin ) off - Directives will not be parsed. Tip When HQ parses directives from a file, it will also try to parse a shebang line from the script and use it to select an interpreter for running the script. 
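A sketch of a directive file combining several of the submit options discussed in this documentation; ./my-program is a placeholder for your own executable.
#!/bin/bash
# Directives must precede any non-comment line and may span several #HQ lines
#HQ --name=Example
#HQ --cpus="2 compact" --pin taskset
#HQ --time-request 10m
./my-program
# Submit it; a CLI parameter such as --name would override the directive value
$ hq submit script.sh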
Notes # Directives have to be defined at the beginning of the file. Only comments or empty lines are allowed to precede the directives. Directives have to be defined in the first 32KiB of the file; the rest of the file is ignored. Parameters set via the CLI take precedence over parameters set via directives: Parameters that cannot occur multiple times (like --name ) will be overridden by values set from the CLI. Parameters that can occur multiple times (like --resource ) will be combined from the CLI and from directives. A script may contain multiple lines with the #HQ prefix; such lines are combined and evaluated as a single continuous list of parameters.","title":"Directives"},{"location":"jobs/directives/#directives","text":"You can specify job parameters using special comments ( directives ) placed in a submitted shell script. Directives are lines that begin with the #HQ prefix. Any text following this prefix will be interpreted as a command line argument for hq submit .","title":"Directives"},{"location":"jobs/directives/#example-directive-file","text":"Suppose that script.sh has the following content: #!/bin/bash #HQ --name=Example #HQ --cpus=\"2 compact\" --pin taskset ./my-program If you execute $ hq submit script.sh it will behave as if you had executed $ hq submit --name = Example --cpus = \"2 compact\" --pin taskset script.sh","title":"Example directive file"},{"location":"jobs/directives/#directives-mode","text":"You can select one of the following modes using the --directives flag of hq submit . The mode determines when HyperQueue should attempt to parse directives from the provided command. auto (default) - Directives will be parsed if the first command passed to hq submit has the .sh extension. file - Directives will be parsed from the first command passed to hq submit . stdin - Directives will be parsed from stdin (see --stdin ) off - Directives will not be parsed. Tip When HQ parses directives from a file, it will also try to parse a shebang line from the script and use it to select an interpreter for running the script.","title":"Directives mode"},{"location":"jobs/directives/#notes","text":"Directives have to be defined at the beginning of the file. Only comments or empty lines are allowed to precede the directives. Directives have to be defined in the first 32KiB of the file; the rest of the file is ignored. Parameters set via the CLI take precedence over parameters set via directives: Parameters that cannot occur multiple times (like --name ) will be overridden by values set from the CLI. Parameters that can occur multiple times (like --resource ) will be combined from the CLI and from directives. A script may contain multiple lines with the #HQ prefix; such lines are combined and evaluated as a single continuous list of parameters.","title":"Notes"},{"location":"jobs/failure/","text":"In distributed systems, failure is inevitable. This section describes how HyperQueue handles various types of failures and how you can affect its behavior. Resubmitting array jobs # When a job fails or is canceled, you can submit it again. However, in the case of task arrays , different tasks may end in different states, and often we want to recompute only tasks with a specific status (e.g. failed tasks). With the following combination of commands you can recompute only the failed tasks.
Let us assume that we want to recompute all failed tasks in job 5: $ hq submit --array=`hq job task-ids 5 --filter=failed` ./my-computation It works as follows: the command hq job task-ids 5 --filter=failed returns the IDs of the failed tasks of job 5 , and we pass them to the --array parameter, which starts only the tasks with the given IDs. If we want to recompute all failed and all canceled tasks, we can do it as follows: $ hq submit --array=`hq job task-ids 5 --filter=failed,canceled` ./my-computation Note that it also works with --each-line or --from-json , i.e.: # Original computation $ hq submit --each-line=input.txt ./my-computation # Resubmitting failed tasks $ hq submit --each-line=input.txt --array=`hq job task-ids last --filter=failed` ./my-computation Task restart # Sometimes a worker might crash while it is executing some task. In that case, the server will automatically reschedule that task to a different worker and the task will start executing from the beginning. In order to let the executed application know that the same task is being executed repeatedly, HyperQueue assigns each execution a separate Instance ID . It is a 32-bit non-negative number that identifies each (re-)execution of a task. It is guaranteed that a newer execution of a task will have a larger instance ID; however, HyperQueue explicitly does not guarantee any specific values or differences between two IDs. Each instance ID is valid only for a particular task. Two different tasks may have the same instance ID. Instance IDs can be useful e.g. when a task is restarted, and you want to distinguish the output of the first execution from the restarted execution (by default, HQ will overwrite the standard output/error file of the first execution). You can instead create a separate stdout/stderr file for each task execution using the instance ID placeholder . Task array failures # By default, when a single task of a task array fails, the computation of the job will continue. You can change this behavior with the --max-fails= option of the submit command, where X is a non-negative integer. If specified, once more than X tasks fail, the rest of the job's tasks that have not yet completed will be canceled. For example: $ hq submit --array 1-1000 --max-fails 5 ... This will create a task array with 1000 tasks. Once 5 or more tasks fail, the remaining uncompleted tasks of the job will be canceled.","title":"Handling Failure"},{"location":"jobs/failure/#resubmitting-array-jobs","text":"When a job fails or is canceled, you can submit it again. However, in the case of task arrays , different tasks may end in different states, and often we want to recompute only tasks with a specific status (e.g. failed tasks). With the following combination of commands you can recompute only the failed tasks. Let us assume that we want to recompute all failed tasks in job 5: $ hq submit --array=`hq job task-ids 5 --filter=failed` ./my-computation It works as follows: the command hq job task-ids 5 --filter=failed returns the IDs of the failed tasks of job 5 , and we pass them to the --array parameter, which starts only the tasks with the given IDs.
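Putting the failure-handling pieces together, here is a sketch of an initial submission with a failure cap followed by a resubmission of only the failed tasks; input.txt and ./my-computation are placeholders, and all flags come from this section.
# Initial run: one task per line of input.txt, cancel the rest once too many tasks fail
$ hq submit --each-line=input.txt --max-fails 5 ./my-computation
# Later: resubmit only the tasks that failed in the last job
$ hq submit --each-line=input.txt --array=`hq job task-ids last --filter=failed` ./my-computation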
If we want to recompute all failed tasks and all canceled tasks we can do it as follows: $ hq submit --array=`hq job task-ids 5 --filter=failed,canceled` ./my-computation Note that it also works with --each-line or --from-json , i.e.: # Original computation $ hq submit --each-line=input.txt ./my-computation # Resubmitting failed jobs $ hq submit --each-line=input.txt --array=`hq job task-ids last --filter=failed` ./my-computation","title":"Resubmitting array jobs"},{"location":"jobs/failure/#task-restart","text":"Sometimes a worker might crash while it is executing some task. In that case the server will automatically reschedule that task to a different worker and the task will begin executing from the beginning. In order to let the executed application know that the same task is being executed repeatedly, HyperQueue assigns each execution a separate Instance ID . It is a 32b non-negative number that identifies each (re-)execution of a task. It is guaranteed that a newer execution of a task will have a larger instance ID, however HyperQueue explicitly does not guarantee any specific values or differences between two IDs. Each instance ID is valid only for a particular task. Two different tasks may have the same instance ID. Instance IDs can be useful e.g. when a task is restarted, and you want to distinguish the output of the first execution and the restarted execution (by default, HQ will overwrite the standard output/error file of the first execution). You can instead create a separate stdout/stderr file for each task execution using the instance ID placeholder .","title":"Task restart"},{"location":"jobs/failure/#task-array-failures","text":"By default, when a single task of a task array fails, the computation of the job will continue. You can change this behavior with the --max-fails= option of the submit command, where X is non-negative integer. If specified, once more tasks than X tasks fail, the rest of the job's tasks that were not completed yet will be canceled. For example: $ hq submit --array 1-1000 --max-fails 5 ... This will create a task array with 1000 tasks. Once 5 or more tasks fail, the remaining uncompleted tasks of the job will be canceled.","title":"Task array failures"},{"location":"jobs/jobfile/","text":"Job Definition File # Job Definition File (JDF) a way how to submit a complex pipeline into a HyperQueue. It is a TOML file that describes tasks of a job. JDF provides all functionalities as command line interface of HyperQueue and also adds access to additional features: Heterogeneous tasks -- Job may be composed of different tasks Dependencies -- Tasks may have dependencies Resource request alternatives -- Task may have alternative resource requests, e.g.: 4 cpus OR 1 cpus and 1 gpu Note that these features are also available through Python interface. Minimal example # First, we create file with the following content: [[task]] command = [ \"sleep\" , \"1\" ] Let us assume that we have named this file as myfile.toml , then we can run the following command to submit a job: $ hq job submit-file myfile.toml The effect will be same as running: $ hq submit sleep 1 Task configuration # The following shows how job and task may be configured in more detail. All options except command are optional. If not said otherwise, an option in format xxx = ... is an equivalent of --xxx = ... in hq submit command. The default are the same as CLI interface. 
name = \"test-job\" stream = \"path/to/stream/dir\" # Stdout/Stderr streaming (see --stream) max_fails = 11 [[task]] stdout = \"testout-%{TASK_ID}\" stderr = { path = \"testerr-%{TASK_ID}\" , mode = \"rm-if-finished\" } task_dir = true time_limit = \"1m 10s\" priority = -1 crash_limit = 12 command = [ \"/bin/bash\" , \"-c\" , \"echo $ABC\" ] # Environment variables env = { \"ABC\" = \"123\" , \"XYZ\" = \"aaaa\" } # Content that will be written on stdin stdin = \"Hello world!\" [[task.request]] resources = { \"cpus\" = \"4 compact!\" , \"gpus\" = 2 } time_request = \"10s\" More tasks # More tasks with different configuration may be defined as follows: [[task]] command = [ \"sleep\" , \"1\" ] [[task]] command = [ \"sleep\" , \"2\" ] [[task]] command = [ \"sleep\" , \"3\" ] In the case above, tasks are given automatic task ids from id 0. You can also specify IDs manually: [[task]] id = 10 command = [ \"sleep\" , \"1\" ] [[task]] id = 11 command = [ \"sleep\" , \"2\" ] [[task]] id = 2 command = [ \"sleep\" , \"3\" ] Task arrays # If you want to create uniform tasks you can define task array (similar to --array ): [[array]] ids = \"1,2,50-100\" command = [ \"sleep\" , \"1\" ] You can also specify array with content of HQ_ENTRIES : [[array]] entries = [ \"One\" , \"Two\" , \"Three\" ] command = [ \"sleep\" , \"1\" ] Note Options entries and ids can be used together. Task dependencies # Job Definition File allows to define a dependencies between tasks. In other words, it means that the task may be executed only if the previous tasks are already finished. The task's option deps defines on which tasks the given task dependents. The task is addressed by their IDs. The following example creates three tasks where the third task depends on the first two tasks. [[task]] id = 1 command = [ ...] [[task]] id = 3 command = [ ...] [[task]] id = 5 command = [ ...] deps = [ 1 , 3 ] # <---- Dependency on tasks 1 and 3 Resource variants # More resource configurations may be defined for a task. In this case, HyperQueue will take into account all these configurations during scheduling. When a task is started exactly one configuration is chosen. If in a given moment more configuration are possible for a given task, the configuration first defined has a higher priority. The following configuration defines that a task may be executed on 1 cpus and 1 gpu OR on 4 cpus. [[task]] command = [ ...] [[task.request]] resources = { \"cpus\" = 1 , \"gpus\" = 1 } [[task.request]] resources = { \"cpus\" = 4 } In the case that many tasks with such a configuration are submitted to a worker with 16 cpus and 4 gpus then HyperQueue will run simultaneously 4 tasks in the first configuration and 3 tasks in the second one. For a task with resource variants, HyperQueue sets variable HQ_RESOURCE_VARIANT to an index of chosen variant (counted from 0) when a task is started. Non-integer resource amounts # You may specify a resource number as float, e.g. resources = { \"foo\" = 1.5 } . It is valid but internally the type if converted to float, that may for some numbers lead to a rounding up when number is converted to 4-digit precision of resource amounts. If you want to avoid this, put the number into parentheses, e.g. resources = { \"foo\" = \"1.5\" } .","title":"Job Definition File"},{"location":"jobs/jobfile/#job-definition-file","text":"Job Definition File (JDF) a way how to submit a complex pipeline into a HyperQueue. It is a TOML file that describes tasks of a job. 
JDF provides all functionalities as command line interface of HyperQueue and also adds access to additional features: Heterogeneous tasks -- Job may be composed of different tasks Dependencies -- Tasks may have dependencies Resource request alternatives -- Task may have alternative resource requests, e.g.: 4 cpus OR 1 cpus and 1 gpu Note that these features are also available through Python interface.","title":"Job Definition File"},{"location":"jobs/jobfile/#minimal-example","text":"First, we create file with the following content: [[task]] command = [ \"sleep\" , \"1\" ] Let us assume that we have named this file as myfile.toml , then we can run the following command to submit a job: $ hq job submit-file myfile.toml The effect will be same as running: $ hq submit sleep 1","title":"Minimal example"},{"location":"jobs/jobfile/#task-configuration","text":"The following shows how job and task may be configured in more detail. All options except command are optional. If not said otherwise, an option in format xxx = ... is an equivalent of --xxx = ... in hq submit command. The default are the same as CLI interface. name = \"test-job\" stream = \"path/to/stream/dir\" # Stdout/Stderr streaming (see --stream) max_fails = 11 [[task]] stdout = \"testout-%{TASK_ID}\" stderr = { path = \"testerr-%{TASK_ID}\" , mode = \"rm-if-finished\" } task_dir = true time_limit = \"1m 10s\" priority = -1 crash_limit = 12 command = [ \"/bin/bash\" , \"-c\" , \"echo $ABC\" ] # Environment variables env = { \"ABC\" = \"123\" , \"XYZ\" = \"aaaa\" } # Content that will be written on stdin stdin = \"Hello world!\" [[task.request]] resources = { \"cpus\" = \"4 compact!\" , \"gpus\" = 2 } time_request = \"10s\"","title":"Task configuration"},{"location":"jobs/jobfile/#more-tasks","text":"More tasks with different configuration may be defined as follows: [[task]] command = [ \"sleep\" , \"1\" ] [[task]] command = [ \"sleep\" , \"2\" ] [[task]] command = [ \"sleep\" , \"3\" ] In the case above, tasks are given automatic task ids from id 0. You can also specify IDs manually: [[task]] id = 10 command = [ \"sleep\" , \"1\" ] [[task]] id = 11 command = [ \"sleep\" , \"2\" ] [[task]] id = 2 command = [ \"sleep\" , \"3\" ]","title":"More tasks"},{"location":"jobs/jobfile/#task-arrays","text":"If you want to create uniform tasks you can define task array (similar to --array ): [[array]] ids = \"1,2,50-100\" command = [ \"sleep\" , \"1\" ] You can also specify array with content of HQ_ENTRIES : [[array]] entries = [ \"One\" , \"Two\" , \"Three\" ] command = [ \"sleep\" , \"1\" ] Note Options entries and ids can be used together.","title":"Task arrays"},{"location":"jobs/jobfile/#task-dependencies","text":"Job Definition File allows to define a dependencies between tasks. In other words, it means that the task may be executed only if the previous tasks are already finished. The task's option deps defines on which tasks the given task dependents. The task is addressed by their IDs. The following example creates three tasks where the third task depends on the first two tasks. [[task]] id = 1 command = [ ...] [[task]] id = 3 command = [ ...] [[task]] id = 5 command = [ ...] deps = [ 1 , 3 ] # <---- Dependency on tasks 1 and 3","title":"Task dependencies"},{"location":"jobs/jobfile/#resource-variants","text":"More resource configurations may be defined for a task. In this case, HyperQueue will take into account all these configurations during scheduling. When a task is started exactly one configuration is chosen. 
If more configurations are possible for a given task at a given moment, the configuration defined first has a higher priority. The following configuration defines that a task may be executed on 1 cpu and 1 gpu OR on 4 cpus. [[task]] command = [ ...] [[task.request]] resources = { \"cpus\" = 1 , \"gpus\" = 1 } [[task.request]] resources = { \"cpus\" = 4 } If many tasks with such a configuration are submitted to a worker with 16 cpus and 4 gpus, then HyperQueue will simultaneously run 4 tasks in the first configuration and 3 tasks in the second one. For a task with resource variants, HyperQueue sets the variable HQ_RESOURCE_VARIANT to the index of the chosen variant (counted from 0) when the task is started.","title":"Resource variants"},{"location":"jobs/jobfile/#non-integer-resource-amounts","text":"You may specify a resource amount as a float, e.g. resources = { \"foo\" = 1.5 } . This is valid, but internally the value is converted to a float, which may for some numbers lead to rounding when the number is converted to the 4-digit precision of resource amounts. If you want to avoid this, put the number into quotes, e.g. resources = { \"foo\" = \"1.5\" } .","title":"Non-integer resource amounts"},{"location":"jobs/jobs/","text":"The main unit of computation within HyperQueue is called a Task . It represents a single computation (currently, a single execution of some program) that is scheduled and executed on a worker. To actually compute something, you have to create a Job , which is a collection of tasks (a task graph). Jobs are units of computation management - you can submit, query or cancel jobs using the CLI. Note This section focuses on simple jobs , where each job contains exactly one task. See Task arrays to find out how to create jobs with multiple tasks. Identification numbers # Each job is identified by a positive integer that is assigned by the HyperQueue server when the job is submitted. We refer to it as Job id . Each task within a job is identified by an unsigned 32b integer called Task id . Task id is either generated by the server or assigned by the user. Task ids are always relative to a specific job; two tasks inside different jobs can thus have the same task id. In simple jobs, task id is always set to 0 . Submitting jobs # To submit a simple job that will execute some executable with the provided arguments, use the hq submit command: $ hq submit ... When you submit a job, the server will assign it a unique job id and print it. You can use this ID in the following commands to refer to the submitted job. After the job is submitted, HyperQueue will distribute it to a connected worker that will then execute the provided command. Warning The provided command will be executed on a worker that might be running on a different machine. You should thus make sure that the binary will be available there and that you provide an absolute path to it. Note When your command contains its own command line flags, you must put the command and its flags after -- : $ hq submit -- /bin/bash -c 'echo $PPID' There are many parameters that you can set for the executed program; they are listed below. Name # Each job has an assigned name. It has only an informative character for the user. By default, the name is derived from the job's program name. You can also set the job name explicitly with the --name option: $ hq submit --name = ... Working directory # By default, the working directory of the job will be set to the directory from which the job was submitted.
You can change this using the --cwd option: $ hq submit --cwd = ... Warning Make sure that the provided path exists on all worker nodes. Hint You can use placeholders in the working directory path. Output # By default, each job will produce two files containing the standard output and standard error output, respectively. The default paths of these files are %{CWD}/job-%{JOB_ID}/%{TASK_ID}.stdout for stdout %{CWD}/job-%{JOB_ID}/%{TASK_ID}.stderr for stderr %{JOB_ID} and %{TASK_ID} are so-called placeholders, you can read about them below . You can change these paths with the --stdout and --stderr options. You can also avoid creating stdout / stderr files completely by setting the value to none : Change output paths Disable stdout $ hq submit --stdout = out.txt --stderr = err.txt ... $ hq submit --stdout = none ... Warning Make sure that the provided path(s) exist on all worker nodes. Also note that if you provide a relative path, it will be resolved relative to the directory from where you submit the job, not relative to the working directory of the job. If you want to change that, use the %{CWD} placeholder . Environment variables # You can set environment variables which will be passed to the provided command when the job is executed using the --env = option. Multiple environment variables can be passed if you repeat the option. $ hq submit --env KEY1 = VAL1 --env KEY2 = VAL2 ... Each executed task will also automatically receive the following environment variables: Variable name Explanation HQ_JOB_ID Job id HQ_TASK_ID Task id HQ_INSTANCE_ID Instance id HQ_RESOURCE_... A set of variables related to allocated resources Time management # You can specify two time-related parameters when submitting a job. They will be applied to each task of the submitted job. Time Limit is the maximal running time of a task. If it is reached, the task will be terminated, and it will transition into the Failed state . This setting has no impact on scheduling. This can serve as a sanity check to make sure that some task will not run indefinitely. You can set it with the --time-limit option 1 : $ hq submit --time-limit = ... Note Time limit is counted separately for each task. If you set a time limit of 3 minutes and create two tasks, where each will run for two minutes, the time limit will not be hit. Time Request is the minimal remaining lifetime that a worker must have in order to start executing the task. Workers that do not have enough remaining lifetime will not be considered for running this task. Time requests are only used during scheduling, where the server decides which worker should execute which task. Once a task is scheduled and starts executing on a worker, the time request value will not have any effect. You can set the time request using the --time-request option 1 : $ hq submit --time-request = ... Note Workers with an unknown remaining lifetime will be able to execute any task, disregarding its time request. Here is an example situation where time limit and time request can be used: Let's assume that we have a collection of tasks where the vast majority of tasks usually finish within 10 minutes, but some of them run for (at most) 30 minutes. We do not know in advance which tasks will be \"slow\". In this case we may want to set the time limit to 35 minutes to protect us against an error (deadlock, endless loop, etc.). 
However, since we know that each task will usually take at least 10 minutes to execute, we don't want to start executing it on a worker if we know that the worker will definitely terminate in less than 10 minutes. It would only cause unnecessary lost computational resources. Therefore, we can set the time request to 10 minutes. Priority # You can modify the order in which tasks are executed using Priority . Priority can be any 32b signed integer. A lower number signifies lower priority, e.g. when task A with priority 5 and task B with priority 3 are scheduled to the same worker and only one of them may be executed, then A will be executed first. You can set the priority using the --priority option: $hq submit --priority = If no priority is specified, then each task will have priority 0 . Placeholders # You can use special variables when setting certain job parameters ( working directory , output paths, log path). These variables, called Placeholders , will be replaced by job or task-specific information before the job is executed. Placeholders are enclosed in curly braces ( {} ) and prefixed with a percent ( % ) sign. You can use the following placeholders: Placeholder Will be replaced by Available for %{JOB_ID} Job ID stdout , stderr , cwd , log %{TASK_ID} Task ID stdout , stderr , cwd %{INSTANCE_ID} Instance ID stdout , stderr , cwd %{SUBMIT_DIR} Directory from which the job was submitted. stdout , stderr , cwd , log %{CWD} Working directory of the task. stdout , stderr %{SERVER_UID} Unique server ID. stdout , stderr , cwd , log SERVER_UID is a random string that is unique for each new server execution (each hq server start gets a separate value). As an example, if you wanted to include the Instance ID in the stdout path (to distinguish the individual outputs of restarted tasks), you can use placeholders like this: $ hq submit --stdout '%{CWD}/job-%{JOB_ID}/%{TASK_ID}-%{INSTANCE_ID}.stdout' ... State # At any moment in time, each task and job has a specific state that represents what is currently happening to it. You can query the state of a job with the following command 2 : $ hq job info Task state # Each task starts in the Waiting state and can end up in one of the terminal states: Finished , Failed or Canceled . Waiting-----------------\\ | ^ | | | | v | | Running-----------------| | | | | \\--------\\ | | | | v v v Finished Failed Canceled Waiting The task was submitted and is now waiting to be executed. Running The task is running on a worker. It may become Waiting again when the worker where the task is running crashes. Finished The task has successfully finished. Failed The task has failed. Canceled The task has been canceled . If a task is in the Finished , Failed or Canceled state, it is completed . Job state # The state of a job is derived from the states of its individual tasks. The state is determined by the first rule that matches from the following list of rules: If at least one task is Running , then job state is Running . If at least one task has not been completed yet, then job state is Waiting . If at least one task is Failed , then job state is Failed . If at least one task is Canceled , then job state is Canceled . If all tasks are finished and job is open (see Open Jobs ), then job state is Opened . Remaining case: all tasks are Finished and job is closed, then job state is Finished . 
Cancelling jobs # You can prematurely terminate a submitted job that haven't been completed yet by cancelling it using the hq job cancel command 2 : $ hq job cancel Cancelling a job will cancel all of its tasks that are not yet completed. Forgetting jobs # If you want to completely forget a job, and thus free up its associated memory, you can do that using the hq job forget command 2 : $ hq job forget By default, all completed jobs (finished/failed/canceled) will be forgotten. You can use the --status parameter to only forget jobs in certain statuses: $ hq job forget all --status finished,canceled However, only jobs that are completed, i.e. that have been finished successfully, failed or have been canceled, can be forgotten. If you want to forget a waiting or a running job, cancel it first. Waiting for jobs # There are three ways of waiting until a job completes: Submit and wait You can use the --wait flag when submitting a job. This will cause the submission command to wait until the job becomes complete: $ hq submit --wait ... Tip This method can be used for benchmarking the job duration. Wait command There is a separate hq job wait command that can be used to wait until an existing job completes 2 : $ hq job wait Interactive wait If you want to interactively observe the status of a job (which is useful especially if it has multiple tasks ), you can use the hq job progress command: Submit and observe Observe an existing job 2 $ hq submit --progress ... $ hq job progress Attaching standard input # When --stdin flag is used, HQ captures standard input and attaches it to each task of a job. When a task is started then the attached data is written into the standard input of the task. This can be used to submitting scripts without creating file. The following command will capture stdin and executes it in Bash $ hq submit --stdin bash If you want to parse #HQ directives from standard input, you can use --directives=stdin . Task directory # When a job is submitted with --task-dir then a temporary directory is created for each task and passed via environment variable HQ_TASK_DIR . This directory is automatically deleted when the task is completed (for any reason). Providing own error message # A task may pass its own error message into the HyperQueue. HyperQueue provides a filename via environment variable HQ_ERROR_FILENAME , if a task creates this file and terminates with a non-zero return code, then the content of this file is taken as an error message. HQ_ERROR_FILENAME is provided only if task directory is set on. The filename is always placed inside the task directory. If the message is longer than 2KiB, then it is truncated to 2KiB. If task terminates with zero return code, then the error file is ignored. Automatic file cleanup # If you create a lot of tasks and do not use output streaming , a lot of stdout / stderr files can be created on the disk. In certain cases, you might not be interested in the contents of these files, especially if the task has finished successfully, and you instead want to remove them as soon as they are not needed. For that, you can use a file cleanup mode when specifying stdout and/or stderr to choose what should happen with the file when its task finishes. The mode is specified as a name following a colon ( : ) after the file path. Currently, one cleanup mode is implemented: Remove the file if the task has finished successfully: $ hq submit --stdout = \"out.txt:rm-if-finished\" /my-program The file will not be deleted if the task fails or is cancelled. 
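The cleanup mode can also be combined with custom paths and applied to both streams at once; here is a small sketch (the file names are illustrative): $ hq submit --stdout=\"out-%{TASK_ID}.txt:rm-if-finished\" --stderr=\"err-%{TASK_ID}.txt:rm-if-finished\" ./my-program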
Note If you want to use the default stdout / stderr file path (and you don't want to look it up), you can also specify just the cleanup mode without the file path: $ hq submit --stdout = \":rm-if-finished\" /my-program Useful job commands # Here is a list of useful job commands: Display job table # List queued and running jobs List all jobs List jobs by status $ hq job list $ hq job list --all You can display only jobs having the selected states by using the --filter flag: $ hq job list --filter running,waiting Valid filter values are: waiting running finished failed canceled Display a summary table of all jobs # $ hq job summary Display information about a specific job # $ hq job info Display information about individual tasks (potentially across multiple jobs) # $ hq task list [--task-status ] [--tasks ] Display job stdout / stderr # $ hq job cat [--tasks ] Crashing limit # When a worker is lost then all running tasks on the worker are suspicious that they may cause the crash of the worker. HyperQueue server remembers how many times were a task running while a worker is lost. If the count reaches the limit, then the task is set to the failed state. By default, this limit is 5 but it can be changed as follows: $ hq submit --crash-limit= ... If the limit is set to 0, then the limit is disabled. You can use various shortcuts for the duration value. \u21a9 \u21a9 You can use various shortcuts to select multiple jobs at once. \u21a9 \u21a9 \u21a9 \u21a9 \u21a9","title":"Jobs and Tasks"},{"location":"jobs/jobs/#identification-numbers","text":"Each job is identified by a positive integer that is assigned by the HyperQueue server when the job is submitted. We refer to it as Job id . Each task within a job is identified by an unsigned 32b integer called Task id . Task id is either generated by the server or assigned by the user. Task ids are always relative to a specific job, two tasks inside different jobs can thus have the same task id. In simple jobs, task id is always set to 0 .","title":"Identification numbers"},{"location":"jobs/jobs/#submitting-jobs","text":"To submit a simple job that will execute some executable with the provided arguments, use the hq submit command: $ hq submit ... When you submit a job, the server will assign it a unique job id and print it. You can use this ID in following commands to refer to the submitted job. After the job is submitted, HyperQueue will distribute it to a connected worker that will then execute the provided command. Warning The provided command will be executed on a worker that might be running on a different machine. You should thus make sure that the binary will be available there and that you provide an absolute path to it. Note When your command contains its own command line flags, you must put the command and its flags after -- : $ hq submit -- /bin/bash -c 'echo $PPID' There are many parameters that you can set for the executed program, they are listed below.","title":"Submitting jobs"},{"location":"jobs/jobs/#name","text":"Each job has an assigned name. It has only an informative character for the user. By default, the name is derived from the job's program name. You can also set the job name explicitly with the --name option: $ hq submit --name = ...","title":"Name"},{"location":"jobs/jobs/#working-directory","text":"By default, the working directory of the job will be set to the directory from which the job was submitted. You can change this using the --cwd option: $ hq submit --cwd = ... 
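For instance, to run all tasks of a job in a shared scratch directory (the path below is just an example), you could submit: $ hq submit --cwd=/scratch/my-project/run1 ./my-program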
Warning Make sure that the provided path exists on all worker nodes. Hint You can use placeholders in the working directory path.","title":"Working directory"},{"location":"jobs/jobs/#output","text":"By default, each job will produce two files containing the standard output and standard error output, respectively. The default paths of these files are %{CWD}/job-%{JOB_ID}/%{TASK_ID}.stdout for stdout %{CWD}/job-%{JOB_ID}/%{TASK_ID}.stderr for stderr %{JOB_ID} and %{TASK_ID} are so-called placeholders, you can read about them below . You can change these paths with the --stdout and --stderr options. You can also avoid creating stdout / stderr files completely by setting the value to none : Change output paths Disable stdout $ hq submit --stdout = out.txt --stderr = err.txt ... $ hq submit --stdout = none ... Warning Make sure that the provided path(s) exist on all worker nodes. Also note that if you provide a relative path, it will be resolved relative to the directory from where you submit the job, not relative to the working directory of the job. If you want to change that, use the %{CWD} placeholder .","title":"Output"},{"location":"jobs/jobs/#environment-variables","text":"You can set environment variables which will be passed to the provided command when the job is executed using the --env = option. Multiple environment variables can be passed if you repeat the option. $ hq submit --env KEY1 = VAL1 --env KEY2 = VAL2 ... Each executed task will also automatically receive the following environment variables: Variable name Explanation HQ_JOB_ID Job id HQ_TASK_ID Task id HQ_INSTANCE_ID Instance id HQ_RESOURCE_... A set of variables related to allocated resources","title":"Environment variables"},{"location":"jobs/jobs/#time-management","text":"You can specify two time-related parameters when submitting a job. They will be applied to each task of the submitted job. Time Limit is the maximal running time of a task. If it is reached, the task will be terminated, and it will transition into the Failed state . This setting has no impact on scheduling. This can serve as a sanity check to make sure that some task will not run indefinitely. You can set it with the --time-limit option 1 : $ hq submit --time-limit = ... Note Time limit is counted separately for each task. If you set a time limit of 3 minutes and create two tasks, where each will run for two minutes, the time limit will not be hit. Time Request is the minimal remaining lifetime that a worker must have in order to start executing the task. Workers that do not have enough remaining lifetime will not be considered for running this task. Time requests are only used during scheduling, where the server decides which worker should execute which task. Once a task is scheduled and starts executing on a worker, the time request value will not have any effect. You can set the time request using the --time-request option 1 : $ hq submit --time-request = ... Note Workers with an unknown remaining lifetime will be able to execute any task, disregarding its time request. Here is an example situation where time limit and time request can be used: Let's assume that we have a collection of tasks where the vast majority of tasks usually finish within 10 minutes, but some of them run for (at most) 30 minutes. We do not know in advance which tasks will be \"slow\". In this case we may want to set the time limit to 35 minutes to protect us against an error (deadlock, endless loop, etc.). 
However, since we know that each task will usually take at least 10 minutes to execute, we don't want to start executing it on a worker if we know that the worker will definitely terminate in less than 10 minutes. It would only cause unnecessary lost computational resources. Therefore, we can set the time request to 10 minutes.","title":"Time management"},{"location":"jobs/jobs/#priority","text":"You can modify the order in which tasks are executed using Priority . Priority can be any 32b signed integer. A lower number signifies lower priority, e.g. when task A with priority 5 and task B with priority 3 are scheduled to the same worker and only one of them may be executed, then A will be executed first. You can set the priority using the --priority option: $hq submit --priority = If no priority is specified, then each task will have priority 0 .","title":"Priority"},{"location":"jobs/jobs/#placeholders","text":"You can use special variables when setting certain job parameters ( working directory , output paths, log path). These variables, called Placeholders , will be replaced by job or task-specific information before the job is executed. Placeholders are enclosed in curly braces ( {} ) and prefixed with a percent ( % ) sign. You can use the following placeholders: Placeholder Will be replaced by Available for %{JOB_ID} Job ID stdout , stderr , cwd , log %{TASK_ID} Task ID stdout , stderr , cwd %{INSTANCE_ID} Instance ID stdout , stderr , cwd %{SUBMIT_DIR} Directory from which the job was submitted. stdout , stderr , cwd , log %{CWD} Working directory of the task. stdout , stderr %{SERVER_UID} Unique server ID. stdout , stderr , cwd , log SERVER_UID is a random string that is unique for each new server execution (each hq server start gets a separate value). As an example, if you wanted to include the Instance ID in the stdout path (to distinguish the individual outputs of restarted tasks), you can use placeholders like this: $ hq submit --stdout '%{CWD}/job-%{JOB_ID}/%{TASK_ID}-%{INSTANCE_ID}.stdout' ...","title":"Placeholders"},{"location":"jobs/jobs/#state","text":"At any moment in time, each task and job has a specific state that represents what is currently happening to it. You can query the state of a job with the following command 2 : $ hq job info ","title":"State"},{"location":"jobs/jobs/#task-state","text":"Each task starts in the Waiting state and can end up in one of the terminal states: Finished , Failed or Canceled . Waiting-----------------\\ | ^ | | | | v | | Running-----------------| | | | | \\--------\\ | | | | v v v Finished Failed Canceled Waiting The task was submitted and is now waiting to be executed. Running The task is running on a worker. It may become Waiting again when the worker where the task is running crashes. Finished The task has successfully finished. Failed The task has failed. Canceled The task has been canceled . If a task is in the Finished , Failed or Canceled state, it is completed .","title":"Task state"},{"location":"jobs/jobs/#job-state","text":"The state of a job is derived from the states of its individual tasks. The state is determined by the first rule that matches from the following list of rules: If at least one task is Running , then job state is Running . If at least one task has not been completed yet, then job state is Waiting . If at least one task is Failed , then job state is Failed . If at least one task is Canceled , then job state is Canceled . If all tasks are finished and job is open (see Open Jobs ), then job state is Opened . 
Remaining case: all tasks are Finished and job is closed, then job state is Finished .","title":"Job state"},{"location":"jobs/jobs/#cancelling-jobs","text":"You can prematurely terminate a submitted job that haven't been completed yet by cancelling it using the hq job cancel command 2 : $ hq job cancel Cancelling a job will cancel all of its tasks that are not yet completed.","title":"Cancelling jobs"},{"location":"jobs/jobs/#forgetting-jobs","text":"If you want to completely forget a job, and thus free up its associated memory, you can do that using the hq job forget command 2 : $ hq job forget By default, all completed jobs (finished/failed/canceled) will be forgotten. You can use the --status parameter to only forget jobs in certain statuses: $ hq job forget all --status finished,canceled However, only jobs that are completed, i.e. that have been finished successfully, failed or have been canceled, can be forgotten. If you want to forget a waiting or a running job, cancel it first.","title":"Forgetting jobs"},{"location":"jobs/jobs/#waiting-for-jobs","text":"There are three ways of waiting until a job completes: Submit and wait You can use the --wait flag when submitting a job. This will cause the submission command to wait until the job becomes complete: $ hq submit --wait ... Tip This method can be used for benchmarking the job duration. Wait command There is a separate hq job wait command that can be used to wait until an existing job completes 2 : $ hq job wait Interactive wait If you want to interactively observe the status of a job (which is useful especially if it has multiple tasks ), you can use the hq job progress command: Submit and observe Observe an existing job 2 $ hq submit --progress ... $ hq job progress ","title":"Waiting for jobs"},{"location":"jobs/jobs/#attaching-standard-input","text":"When --stdin flag is used, HQ captures standard input and attaches it to each task of a job. When a task is started then the attached data is written into the standard input of the task. This can be used to submitting scripts without creating file. The following command will capture stdin and executes it in Bash $ hq submit --stdin bash If you want to parse #HQ directives from standard input, you can use --directives=stdin .","title":"Attaching standard input"},{"location":"jobs/jobs/#task-directory","text":"When a job is submitted with --task-dir then a temporary directory is created for each task and passed via environment variable HQ_TASK_DIR . This directory is automatically deleted when the task is completed (for any reason).","title":"Task directory"},{"location":"jobs/jobs/#providing-own-error-message","text":"A task may pass its own error message into the HyperQueue. HyperQueue provides a filename via environment variable HQ_ERROR_FILENAME , if a task creates this file and terminates with a non-zero return code, then the content of this file is taken as an error message. HQ_ERROR_FILENAME is provided only if task directory is set on. The filename is always placed inside the task directory. If the message is longer than 2KiB, then it is truncated to 2KiB. If task terminates with zero return code, then the error file is ignored.","title":"Providing own error message"},{"location":"jobs/jobs/#automatic-file-cleanup","text":"If you create a lot of tasks and do not use output streaming , a lot of stdout / stderr files can be created on the disk. 
In certain cases, you might not be interested in the contents of these files, especially if the task has finished successfully, and you instead want to remove them as soon as they are not needed. For that, you can use a file cleanup mode when specifying stdout and/or stderr to choose what should happen with the file when its task finishes. The mode is specified as a name following a colon ( : ) after the file path. Currently, one cleanup mode is implemented: Remove the file if the task has finished successfully: $ hq submit --stdout = \"out.txt:rm-if-finished\" /my-program The file will not be deleted if the task fails or is cancelled. Note If you want to use the default stdout / stderr file path (and you don't want to look it up), you can also specify just the cleanup mode without the file path: $ hq submit --stdout = \":rm-if-finished\" /my-program","title":"Automatic file cleanup"},{"location":"jobs/jobs/#useful-job-commands","text":"Here is a list of useful job commands:","title":"Useful job commands"},{"location":"jobs/jobs/#display-job-table","text":"List queued and running jobs List all jobs List jobs by status $ hq job list $ hq job list --all You can display only jobs having the selected states by using the --filter flag: $ hq job list --filter running,waiting Valid filter values are: waiting running finished failed canceled","title":"Display job table"},{"location":"jobs/jobs/#display-a-summary-table-of-all-jobs","text":"$ hq job summary","title":"Display a summary table of all jobs"},{"location":"jobs/jobs/#display-information-about-a-specific-job","text":"$ hq job info ","title":"Display information about a specific job"},{"location":"jobs/jobs/#display-information-about-individual-tasks-potentially-across-multiple-jobs","text":"$ hq task list [--task-status ] [--tasks ]","title":"Display information about individual tasks (potentially across multiple jobs)"},{"location":"jobs/jobs/#display-job-stdoutstderr","text":"$ hq job cat [--tasks ] ","title":"Display job stdout/stderr"},{"location":"jobs/jobs/#crashing-limit","text":"When a worker is lost then all running tasks on the worker are suspicious that they may cause the crash of the worker. HyperQueue server remembers how many times were a task running while a worker is lost. If the count reaches the limit, then the task is set to the failed state. By default, this limit is 5 but it can be changed as follows: $ hq submit --crash-limit= ... If the limit is set to 0, then the limit is disabled. You can use various shortcuts for the duration value. \u21a9 \u21a9 You can use various shortcuts to select multiple jobs at once. \u21a9 \u21a9 \u21a9 \u21a9 \u21a9","title":"Crashing limit"},{"location":"jobs/multinode/","text":"Warning Multi-node support is now in the experimental stage. The core functionality is working, but some features may be limited and quality of scheduling may vary. Also auto allocation feature is not yet fully prepared for multi-node tasks. Multi-node tasks are tasks that spreads across multiple nodes. Each node reserved for such task is exclusively reserved, i.e. no other tasks may run on such nodes. A job with multi-node task can be specified by --nodes=X option. An example of a job with multi-node task asking for 4 nodes: $ hq submit --nodes 4 test.sh When the task is started, four nodes are assigned to this task. One of them is chosen as \"root\" node where test.sh is started. Node names of all assigned nodes can be found in file which path is in environmental variable HQ_NODE_FILE . 
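For example, a multi-node task script might simply print the assigned nodes (a minimal sketch): #!/bin/sh cat $HQ_NODE_FILE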
Each line is a node name. The first line is always the root node. The node name is a short hostname, i.e. the hostname stripped of the suffix after the first \".\" (e.g. if a worker's hostname is \"cn690.karolina.it4i.cz\", then the node name is \"cn690\"). Many HPC applications use only short hostnames, hence we provide them by default. If you need the full hostnames, there is a file whose path is stored in HQ_HOST_FILE ; it has the same meaning as HQ_NODE_FILE but contains the full node hostnames without stripping. Note: Both files are placed in the task directory; therefore, a multi-node task always enables the task directory ( --task-dir ). When a multi-node task is started, HQ also creates the variable HQ_NUM_NODES that holds the number of nodes assigned to the task (i.e. the number of lines of the node file). Groups # A multi-node task is started only on workers that belong to the same group. By default, workers are grouped by PBS/Slurm allocations and workers outside any allocation are put in the \"default\" group. A worker's group can be specified when the worker is started, and it may be any string. Example: $ hq worker start --group my_group Running MPI tasks # A script that starts an MPI program in a multi-node task may look as follows: #!/bin/sh mpirun --node-list = $HQ_NODE_FILE ./a-program If you are running Slurm, you should start the MPI program as follows: #!/bin/sh srun --nodefile=$HQ_NODE_FILE --nodes=$HQ_NUM_NODES mpirun ... Note: It is important to set --nodes , otherwise the node file will not be respected.","title":"Multinode Tasks"},{"location":"jobs/multinode/#groups","text":"A multi-node task is started only on workers that belong to the same group. By default, workers are grouped by PBS/Slurm allocations and workers outside any allocation are put in the \"default\" group. A worker's group can be specified when the worker is started, and it may be any string. Example: $ hq worker start --group my_group","title":"Groups"},{"location":"jobs/multinode/#running-mpi-tasks","text":"A script that starts an MPI program in a multi-node task may look as follows: #!/bin/sh mpirun --node-list = $HQ_NODE_FILE ./a-program If you are running Slurm, you should start the MPI program as follows: #!/bin/sh srun --nodefile=$HQ_NODE_FILE --nodes=$HQ_NUM_NODES mpirun ... Note: It is important to set --nodes , otherwise the node file will not be respected.","title":"Running MPI tasks"},{"location":"jobs/openjobs/","text":"Open jobs # By default, a job is a set of tasks that are created atomically during a submit, and no other task can be added to the job. We call such a job closed . In contrast, HQ allows you to create an open job that allows new tasks to be submitted as long as it is open. Opening a job # A job can be opened by the following command: $ hq job open If opening was successful, this will be printed: Job is open. If you want to get just the ID without any additional text, you can open the job as follows: $ hq --output-mode=quiet job open Note: In the list of jobs, an open job is marked with \"*\" before the ID. Submitting tasks into open jobs # A submit to an open job is the same as a normal submit, except that you must specify the job you are submitting to with the --job argument. You may submit multiple times into the same job. Tasks are scheduled to the workers immediately when they are received by the server. $ hq submit --job ... other submit args ... $ hq submit --job ... other submit args ... $ hq submit --job ... other submit args ... Task Ids # All tasks in one job share the task ID space.
When you do not specify task IDs, HQ automatically assigns the smallest ID that is larger than any existing task ID. $ hq job open $ hq submit --job -- hostname # Task ID is 0 $ hq submit --job -- hostname # Task ID is 1 # Task IDs are 2, 3, 4 ...
$ hq submit --job --each-line='test.txt' -- do-something If you are explicitly specifying task IDs, it is an error if task ID is reused: $ hq submit --job -- hostname # Task ID is 0 # This is Ok $ hq submit --job --array 10-20 -- hostname # This fails: Task ID 0 and 10, 11, 12 already exist $ hq submit --job --array 0-12 -- hostname","title":"Task Ids"},{"location":"jobs/openjobs/#job-name-and-max-fails","text":"Job's name and configuration open --max-fails are the property of the job. They can be set when job is opened and they cannot be later changed. Submit options --name and --max-fails cannot be used if you are submitting into an open job. # Configuring jobs's name and max fails $ hq job open --name=MyOpenJob --max-fails=10 # Submit fails becase --max-fails cannot be used together with --job $ hq submit --job --max-fails=5 ...","title":"Job name and --max-fails"},{"location":"jobs/openjobs/#submit-file-into-open-job","text":"Submitting job definition file into an open job works in the similar way as a normal submit, you just need to add --job parameter. $ hq job submit-file --job job-definition.toml","title":"Submit file into open job"},{"location":"jobs/openjobs/#closing-job","text":"You can close a job by calling: $ hq job close When a job is closed, you are not allowed to submit any more tasks to the job. It has no effect on tasks already submitted to the job; they continue to be processed as usual. Closing of already closed job throws an error. Leaving open jobs has no overhead, but it does affect the semantics of job completion. A job is considered completed when all tasks have been completed and the job is closed . Therefore, hq job wait ... will wait until all tasks of the selected jobs are complete and the jobs are closed. If you want to wait only for completion of tasks and ignoring if job is open or closed then there is hq job wait --without-close ... .","title":"Closing job"},{"location":"jobs/resources/","text":"Resource management # Resource management serves for defining arbitrary resources provided by workers and also corresponding resource requests required by tasks. HyperQueue will take care of matching task resource requests so that only workers that can fulfill them will be able to execute such tasks. Some generic resources are automatically detected ; however, users may also define their own resources. From version 0.13.0, CPUs are also managed as other resources, but they have still some extra functionality; therefore, there is a special section about CPU resources . Important Resources in HyperQueue exist on a purely logical level. They can correspond to physical things (like GPUs), but it is the responsibility of the user to make sure that this correspondence makes sense. With exception of CPUs, HyperQueue by itself does not attach any semantics to resources, they are just numbers used for scheduling. Worker resources # Each worker has one or mores resources attached. Each resource is a resource pool identified by a name. A resource pool represents some resources provided by a worker; each task can then ask for a part of the resources contained in that pool. There are two kinds of resource pools: Indexed pool : This pool represents an enumerated set of resources represented by strings. Each resource has its own identity. Tasks do not ask for specific values from the set, they just specify how many resources they require and HyperQueue will allocate the specified amount of resources from the pool for each task. 
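As a sketch, a worker could expose four accelerator boards as an indexed pool (the resource name fpga and its values are illustrative), and a task could then ask for any two of them: $ hq worker start --resource \"fpga=[b0,b1,b2,b3]\" $ hq submit --resource fpga=2 ./my-program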
This pool is useful for resources that have their own identity, for example individual GPU or FPGA accelerators. HyperQueue guarantees that no individual resource from the indexed pool is allocated to more than a single task at any given time and that a task will not be executed on a worker if it does not currently have enough individual resources to fulfill the resource request of the task. Indexed pool can be defined with groups where indices live in separated groups. Task may then ask for different allocation policies (e.g. use resources from the same or different groups). The main purpose of this is to capture NUMA architectures, each group then represents a socket with cores. Sum pool : This pool represents a resource that has a certain size which is split into individual tasks. A typical example is memory; if a worker has 2000 bytes of memory, it can serve e.g. four tasks, if each task asks for 500 bytes of memory. HyperQueue guarantees that the sum of resource request sizes of running tasks on a worker does not exceed the total size of the sum pool. Specifying worker resources # You can specify the resource pools of a worker when you start it: $ hq worker start --resource \"=\" --resource \"=\" ... where NAMEi is a name (string ) of the i -th resource pool and DEFi is a definition of the i-th resource pool. You can define resource pools using one of the following formats: [, , ..., ] where VALUE is a string. This defines a an indexed pool with the given values. If you need to enter a string resource that contains special characters ( [ , ] , , , whitespace), you can wrap the value in quotes: [\"foo [,]\", bar, \"my resource\"] . range(-) where START and END are non-negative integers. This defines an indexed pool with numbers in the inclusive range [START, END] . [[, ..., ], [, ..., ], ...] where VALUE is a string. This defines an indexed pool where indices are grouped. x Creates indexed pool with N groups of size M, indices are indexed from 0, (e.g. \"2x3\" is equivalent to [[0, 1, 2], [3, 4, 5] ) sum() where SIZE is a positive integer. This defines a sum pool with the given size. Tip You might encounter a problem in your shell when you try to specify worker resources, because the definition contains parentheses ( () ). In that case just wrap the resource definition in quotes, like this: $ hq worker start --resource \"foo=sum(5)\" Resource names # Resource names are restricted by the following rules: They can only contain ASCII letters and digits ( a-z , A-Z , 0-9 ) and the slash ( / ) symbol. They need to begin with an ASCII letter. These restrictions exist because the resource names are passed as environment variable names to tasks, which often execute shell scripts. However, shells typically do not support environment variables containing anything else than ASCII letters, digits and the underscore symbol. Therefore, HQ limits resource naming to align with the behaviour of the shell. Important HQ will normalize the resource name when passing environment variables to a task (see below ). Automatically detected resources # The following resources are detected automatically if a resource of a given name is not explicitly defined. CPUs are automatically detected as resource named \"cpus\" (more in CPU resources ). GPUs that are available when a worker is started are automatically detected under the following resource names: NVIDIA GPUs are stored the under resource name gpus/nvidia . 
These GPUs are detected from the environment variable CUDA_VISIBLE_DEVICES or from the procfs filesystem. AMD GPUs are stored under the resource name gpus/amd . These GPUs are detected from the environment variable ROCR_VISIBLE_DEVICES . You can set these environment variables when starting a worker to override the list of available GPUs: $ CUDA_VISIBLE_DEVICES = 2 ,3 hq worker start # The worker will have resource gpus/nvidia=[2,3] RAM of the node is detected as resource \"mem\" in megabytes; i.e. --resource mem=100 asks for 100 MiBs of the memory. If you want to see how is your system seen by a worker without actually starting it, you can start: $ hq worker hwdetect The automatic detection of resources can be disabled by argument --no-detect-resources in hq worker start ... . It disables detection of resources other than \"cpus\"; if resource \"cpus\" are not explicitly defined, it will always be detected. Resource request # When you submit a job, you can define a resource requests with the --resource flag: $ hq submit --resource = --resource = ... Where NAME is a name of the requested resource and the AMOUNT is a positive number defining the size of the request. Tasks with such resource requests will only be executed on workers that fulfill all the specified task requests. Important Notice that task resource requests always ask for an amount of resources required by a task, regardless whether that resource corresponds to an indexed or a sum pool on workers. For example, let's say that a worker has an indexed pool of GPUs: $ hq worker start --resource \"gpus/nvidia=range(1-3)\" And we create two jobs, each with a single task. The first job wants 1 GPU, the second one wants two GPUs. $ hq submit --resource gpus/nvidia = 1 ... $ hq submit --resource gpus/nvidia = 2 ... Then the first job can be allocated e.g. the GPU 2 and the second job can be allocated the GPUs 1 and 3 . Requesting all resources # A task may ask for all given resources of that type by specifying --resource =all . Such a task will be scheduled only on a worker that has at least 1 of such resource and when a task is executed all resources of that type will be given to this task. Resource request strategies # When resource request is defined, after the amount you can define allocation strategy: --resource =\" \" . Specifying strategy has effect only if worker provides indexed resource in groups. If resource is other type, then strategy is ignored. When strategy is not defined then compact is used as default. Compact ( compact ) - Tries to allocate indices in few groups as possible in the current worker state. Example: $ hq submit --resource cpus = \"8 compact\" ... Strict Compact ( compact! ) - Always allocate indices on as few groups as possible for a target node. The task is not executed until the requirement could not be fully fulfilled. E.g. If a worker has 4 indices per a group and you ask for 4 indices in the strict compact mode, it will always be executed with indices from a single group. If you ask for 8 cpus in the same way, it will always be executed with indices from two groups. Example: $ hq submit --resource cpus = \"8 compact!\" ... ` Scatter ( scatter ) - Allocate indices across as many groups as possible in the current worker state. E.g. Let us assume that a worker has 4 groups with 8 indices per group, and you ask for 8 cpus in the scatter mode. If possible in the current situation, HQ tries to run process with 2 cpus on each socket. Example: $ hq submit --resource cpus = \"8 scatter\" ... 
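To make the difference between the strategies concrete, consider a sketch where a worker provides the cpus resource in 4 groups of 8 indices, for example four NUMA sockets with eight cores each (the --cpus shortcut is described in the section about CPU resources): $ hq worker start --cpus=4x8 $ hq submit --resource cpus=\"8 compact!\" ... # always receives all 8 cores of a single group $ hq submit --resource cpus=\"8 scatter\" ... # tries to take roughly 2 cores from each of the 4 groups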
Non-integer allocation of resources # The amount of a resource may be a non-integer number. E.g. you may ask for 0.5 of a resource. This tells the scheduler that you want to utilize only half of the resource, and if another process asks for at most 0.5 of the resource, it may get the same resource. This resource sharing is done on the logical level of HyperQueue; the actual sharing of the resource is up to the tasks. The precision for defining amounts is four decimal places. Therefore, the minimal resource amount that you can ask for is 0.0001 . For sum resources, the amount is simply removed from the pool as in the case of integer resources. In the case of an indexed resource, the partial resource is always taken from a single index. This means that if there is an indexed resource with two indices that are both utilized at 0.75, then a task that asks for 0.5 of this resource will not be started, even though 0.5 of the resource is available in total, because there is no single index with at least 0.5 free. If a non-integer amount is bigger than 1, then the integer part is always satisfied by whole indices and the rest by a part of another index. E.g. when you ask for 2.5 of an indexed resource, you will get 2 complete indices and one index allocated at 50%. Note In the current version, the policy \"compact!\" is not allowed with non-integer amounts. Resource environment variables # When a task that has resource requests is executed, the following variables are passed to it for each resource request named : HQ_RESOURCE_REQUEST_ contains the amount of requested resources. HQ_RESOURCE_VALUES_ contains the specific resource values allocated for the task as a comma-separated list. This variable is only filled for an indexed resource pool. In the case of a non-integer amount, the partially allocated index is always the last index. The slash symbol ( / ) in a resource name is normalized to an underscore ( _ ) when used in the environment variable name. HQ also sets additional environment variables for various resources with special names: For the resource gpus/nvidia , HQ will set: CUDA_VISIBLE_DEVICES to the same value as HQ_RESOURCE_VALUES_gpus_nvidia CUDA_DEVICE_ORDER to PCI_BUS_ID For the resource gpus/amd , HQ will set: ROCR_VISIBLE_DEVICES to the same value as HQ_RESOURCE_VALUES_gpus_amd Resource requests and job arrays # Resource requests are applied to each task of a job. For example, if you submit the following: $ hq submit --cpus = 2 --array = 1 -10 then each task will require two cores. Resource variants # A task may have multiple resource requests attached. There is no command line interface for this feature, but it can be configured through a Job Definition File .","title":"Resources"},{"location":"jobs/resources/#resource-management","text":"Resource management serves to define arbitrary resources provided by workers and the corresponding resource requests required by tasks. HyperQueue will take care of matching task resource requests so that only workers that can fulfill them will be able to execute such tasks. Some generic resources are automatically detected ; however, users may also define their own resources. Since version 0.13.0, CPUs are also managed like other resources, but they still have some extra functionality; therefore, there is a special section about CPU resources . Important Resources in HyperQueue exist on a purely logical level. They can correspond to physical things (like GPUs), but it is the responsibility of the user to make sure that this correspondence makes sense.
With exception of CPUs, HyperQueue by itself does not attach any semantics to resources, they are just numbers used for scheduling.","title":"Resource management"},{"location":"jobs/resources/#worker-resources","text":"Each worker has one or mores resources attached. Each resource is a resource pool identified by a name. A resource pool represents some resources provided by a worker; each task can then ask for a part of the resources contained in that pool. There are two kinds of resource pools: Indexed pool : This pool represents an enumerated set of resources represented by strings. Each resource has its own identity. Tasks do not ask for specific values from the set, they just specify how many resources they require and HyperQueue will allocate the specified amount of resources from the pool for each task. This pool is useful for resources that have their own identity, for example individual GPU or FPGA accelerators. HyperQueue guarantees that no individual resource from the indexed pool is allocated to more than a single task at any given time and that a task will not be executed on a worker if it does not currently have enough individual resources to fulfill the resource request of the task. Indexed pool can be defined with groups where indices live in separated groups. Task may then ask for different allocation policies (e.g. use resources from the same or different groups). The main purpose of this is to capture NUMA architectures, each group then represents a socket with cores. Sum pool : This pool represents a resource that has a certain size which is split into individual tasks. A typical example is memory; if a worker has 2000 bytes of memory, it can serve e.g. four tasks, if each task asks for 500 bytes of memory. HyperQueue guarantees that the sum of resource request sizes of running tasks on a worker does not exceed the total size of the sum pool.","title":"Worker resources"},{"location":"jobs/resources/#specifying-worker-resources","text":"You can specify the resource pools of a worker when you start it: $ hq worker start --resource \"=\" --resource \"=\" ... where NAMEi is a name (string ) of the i -th resource pool and DEFi is a definition of the i-th resource pool. You can define resource pools using one of the following formats: [, , ..., ] where VALUE is a string. This defines a an indexed pool with the given values. If you need to enter a string resource that contains special characters ( [ , ] , , , whitespace), you can wrap the value in quotes: [\"foo [,]\", bar, \"my resource\"] . range(-) where START and END are non-negative integers. This defines an indexed pool with numbers in the inclusive range [START, END] . [[, ..., ], [, ..., ], ...] where VALUE is a string. This defines an indexed pool where indices are grouped. x Creates indexed pool with N groups of size M, indices are indexed from 0, (e.g. \"2x3\" is equivalent to [[0, 1, 2], [3, 4, 5] ) sum() where SIZE is a positive integer. This defines a sum pool with the given size. Tip You might encounter a problem in your shell when you try to specify worker resources, because the definition contains parentheses ( () ). In that case just wrap the resource definition in quotes, like this: $ hq worker start --resource \"foo=sum(5)\"","title":"Specifying worker resources"},{"location":"jobs/resources/#resource-names","text":"Resource names are restricted by the following rules: They can only contain ASCII letters and digits ( a-z , A-Z , 0-9 ) and the slash ( / ) symbol. They need to begin with an ASCII letter. 
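For example, the name gpus/nvidia (used by the automatically detected GPU resources) satisfies these rules, as would an illustrative custom name such as disk/scratch , whereas names like 1gpu (starts with a digit) or my-resource (contains a dash) are not allowed.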
These restrictions exist because the resource names are passed as environment variable names to tasks, which often execute shell scripts. However, shells typically do not support environment variables containing anything else than ASCII letters, digits and the underscore symbol. Therefore, HQ limits resource naming to align with the behaviour of the shell. Important HQ will normalize the resource name when passing environment variables to a task (see below ).","title":"Resource names"},{"location":"jobs/resources/#automatically-detected-resources","text":"The following resources are detected automatically if a resource of a given name is not explicitly defined. CPUs are automatically detected as resource named \"cpus\" (more in CPU resources ). GPUs that are available when a worker is started are automatically detected under the following resource names: NVIDIA GPUs are stored the under resource name gpus/nvidia . These GPUs are detected from the environment variable CUDA_VISIBLE_DEVICES or from the procfs filesystem. AMD GPUs are stored under the resource name gpus/amd . These GPUs are detected from the environment variable ROCR_VISIBLE_DEVICES . You can set these environment variables when starting a worker to override the list of available GPUs: $ CUDA_VISIBLE_DEVICES = 2 ,3 hq worker start # The worker will have resource gpus/nvidia=[2,3] RAM of the node is detected as resource \"mem\" in megabytes; i.e. --resource mem=100 asks for 100 MiBs of the memory. If you want to see how is your system seen by a worker without actually starting it, you can start: $ hq worker hwdetect The automatic detection of resources can be disabled by argument --no-detect-resources in hq worker start ... . It disables detection of resources other than \"cpus\"; if resource \"cpus\" are not explicitly defined, it will always be detected.","title":"Automatically detected resources"},{"location":"jobs/resources/#resource-request","text":"When you submit a job, you can define a resource requests with the --resource flag: $ hq submit --resource = --resource = ... Where NAME is a name of the requested resource and the AMOUNT is a positive number defining the size of the request. Tasks with such resource requests will only be executed on workers that fulfill all the specified task requests. Important Notice that task resource requests always ask for an amount of resources required by a task, regardless whether that resource corresponds to an indexed or a sum pool on workers. For example, let's say that a worker has an indexed pool of GPUs: $ hq worker start --resource \"gpus/nvidia=range(1-3)\" And we create two jobs, each with a single task. The first job wants 1 GPU, the second one wants two GPUs. $ hq submit --resource gpus/nvidia = 1 ... $ hq submit --resource gpus/nvidia = 2 ... Then the first job can be allocated e.g. the GPU 2 and the second job can be allocated the GPUs 1 and 3 .","title":"Resource request"},{"location":"jobs/resources/#requesting-all-resources","text":"A task may ask for all given resources of that type by specifying --resource =all . Such a task will be scheduled only on a worker that has at least 1 of such resource and when a task is executed all resources of that type will be given to this task.","title":"Requesting all resources"},{"location":"jobs/resources/#resource-request-strategies","text":"When resource request is defined, after the amount you can define allocation strategy: --resource =\" \" . Specifying strategy has effect only if worker provides indexed resource in groups. 
If resource is other type, then strategy is ignored. When strategy is not defined then compact is used as default. Compact ( compact ) - Tries to allocate indices in few groups as possible in the current worker state. Example: $ hq submit --resource cpus = \"8 compact\" ... Strict Compact ( compact! ) - Always allocate indices on as few groups as possible for a target node. The task is not executed until the requirement could not be fully fulfilled. E.g. If a worker has 4 indices per a group and you ask for 4 indices in the strict compact mode, it will always be executed with indices from a single group. If you ask for 8 cpus in the same way, it will always be executed with indices from two groups. Example: $ hq submit --resource cpus = \"8 compact!\" ... ` Scatter ( scatter ) - Allocate indices across as many groups as possible in the current worker state. E.g. Let us assume that a worker has 4 groups with 8 indices per group, and you ask for 8 cpus in the scatter mode. If possible in the current situation, HQ tries to run process with 2 cpus on each socket. Example: $ hq submit --resource cpus = \"8 scatter\" ...","title":"Resource request strategies"},{"location":"jobs/resources/#non-integer-allocation-of-resources","text":"Amount of the resource may be a non-integer number. E.g. you may ask for 0.5 of a resource. It tells the scheduler that you want to utilize only half of the resource and if another process asks for at most 0.5 of the resource, it may get the same resource. This resource sharing is done on logical of HyperQueue and actual resource sharing is up to tasks. The precision for defining amount is four decimal places. Therefore, the minimal resource amount that you can ask for is 0.0001 . For sum resources, the amount is simply removed from the pool as in the case of integer resources. In the case of an indexed resource, the partial resource is always taken from a single index. It means that if there is an indexed resource with two indices that are both utilized on 0.75, then a task that ask for 0.5 of this resource will not be started, despite there is available 0.5 of the resource in total, because there is no single index that is free at least on 0.5. If non-integer is bigger than 1, than integer part is always satisfied as whole indices and rest is a part of another index. E.g. when you ask for 2.5 of an indexed resource, you will get 2 complete indices and one index allocated on 50%. Note In the current version, policy \"compact!\" is not allowed with non-integer amounts.","title":"Non-integer allocation of resources"},{"location":"jobs/resources/#resource-environment-variables","text":"When a task that has resource requests is executed, the following variables are passed to it for each resource request named : HQ_RESOURCE_REQUEST_ contains the amount of requested resources. HQ_RESOURCE_VALUES_ contains the specific resource values allocated for the task as a comma-separated list. This variable is only filled for an indexed resource pool. In case of non-integer amount, the partially allocated index is always the last index. The slash symbol ( / ) in resource name is normalized to underscore ( _ ) when being used in the environment variable name. 
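As a hedged illustration (the requested amount and the allocated indices below are assumptions, not taken from the documentation): a task submitted with --resource gpus/nvidia=2 that is allocated the indices 0 and 1 would see HQ_RESOURCE_REQUEST_gpus_nvidia=2 and HQ_RESOURCE_VALUES_gpus_nvidia=0,1 , so a task shell script could read its allocated devices e.g. with echo $HQ_RESOURCE_VALUES_gpus_nvidia .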
HQ also sets additional environment variables for various resources with special names: For the resource gpus/nvidia , HQ will set: CUDA_VISIBLE_DEVICES to the same value as HQ_RESOURCE_VALUES_gpus_nvidia CUDA_DEVICE_ORDER to PCI_BUS_ID For the resource gpus/amd , HQ will set: ROCR_VISIBLE_DEVICES to the same value as HQ_RESOURCE_VALUES_gpus_amd","title":"Resource environment variables"},{"location":"jobs/resources/#resource-requests-and-job-arrays","text":"Resource requests are applied to each task of a job. For example, if you submit the following: $ hq submit --cpus = 2 --array = 1 -10 then each task will require two cores.","title":"Resource requests and job arrays"},{"location":"jobs/resources/#resource-variants","text":"A task may have multiple resource requests attached. There is no command line interface for this feature, but it can be configured through a Job Definition File .","title":"Resource variants"},{"location":"jobs/streaming/","text":"Jobs containing many tasks will generate a large number of stdout and stderr files, which can be problematic, especially on network-based shared filesystems, such as Lustre. For example, when you submit the following task array: $ hq submit --array = 1 -10000 my-computation.sh 20000 files ( 10000 for stdout and 10000 for stderr) will be created on the disk. To avoid this situation, HyperQueue can optionally stream the stdout and stderr output of tasks into a compact format that does not create a file per task. Note In this section, we refer to stdout and stderr as channels . Redirecting output to the stream # You can redirect the output of stdout and stderr to a log file and thus enable output streaming by passing a path to a directory where the log will be stored with the --stream option: $ hq submit --stream= --array=1-10_000 ... The output log path has to be a directory, and it is the user's responsibility to ensure that the directory exists and is visible to each worker. This command would cause the stdout and stderr of all 10_000 tasks to be streamed into the server, which will write them to files in . The streamed data is written in a compact way independently of the number of tasks. The format also contains additional metadata, which allows the resulting file to be filtered/sorted by task or channel. Tip You can use selected placeholders inside the stream path. Partial redirection # By default, both stdout and stderr will be streamed if you specify --stream and do not specify an explicit path for stdout and stderr . To stream only one of the channels, you can use the --stdout / --stderr options to redirect one of them to a file or to disable it completely. For example: # Redirecting stdout into a file, streaming stderr into `my-log` $ hq submit --stream = my-log --stdout = \"stdout-%{TASK_ID}\" ... # Streaming stdout into `my-log`, disabling stderr $ hq submit --stream = my-log --stderr = none ... Guarantees # HyperQueue provides the following guarantees regarding output streaming: When a task is Finished or Failed , it is guaranteed that all data produced by the task is flushed into the streaming file, with the following two exceptions: If the streaming itself fails (e.g. because there was insufficient disk space for the stream file), then the task will fail with an error prefixed with \"Streamer:\" and no streaming guarantees will be upheld. When a task is Canceled or fails because its time limit is reached, the part of its stream that was buffered in the worker is dropped to avoid spending additional resources on this task.
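As a hedged sketch tying this together: assuming a job was submitted with --stream=/scratch/hq-stream (an illustrative directory) and has finished, its collected output could later be summarized with $ hq output-log /scratch/hq-stream summary where the placement of the stream directory right after output-log follows the command structure described in the next section; the remaining inspection subcommands are covered below as well.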
Inspecting the stream files # HyperQueue lets you inspect the data stored inside the stream file using various subcommands. All these commands have the following structure: $ hq output-log Stream summary # You can display a summary of a log file using the summary subcommand: $ hq output-log summary Stream jobs # To print all job IDs that are streaming into the stream path, you can run the following command: $ hq output-log jobs Printing stream content # If you want to simply print the (textual) content of the log file, without any associated metadata, you can use the cat subcommand: $ hq output-log cat It will print the raw content of either stdout or stderr , ordered by task id. All outputs will be concatenated one after another. You can use this to process the streamed data e.g. by a postprocessing script. By default, this command will fail if there is an unfinished stream (i.e. when some task is still running and streaming data into the log). If you want to use cat even when the log is not finished yet, use the --allow-unfinished option. If you want to see the output of a specific task, you can use the --task= option. Stream metadata # If you want to inspect the contents of the log, along with its inner metadata that shows which task and which channel has produced which part of the data, you can use the show subcommand: $ hq output-log show The output will have the form J.T:C> DATA where J is a job id, T is a task id and C is 0 for the stdout channel and 1 for the stderr channel. You can filter a specific channel with the --channel=stdout/stderr flag. Exporting log # The log can be exported into JSON with the following command: $ hq output-log export This prints the log file in JSON format to standard output. Superseded streams # When a worker crashes while executing a task, the task will be restarted . HyperQueue gives each run of a task a different INSTANCE_ID, which is a part of the stream metadata; hence, HyperQueue streaming is able to avoid mixing outputs from different executions of the same task when a task is restarted. HyperQueue automatically marks all output from previous instances of a task (i.e. all instances except the last one) as superseded . You can see statistics about superseded data via the hq output-log summary command. In the current version, superseded data is ignored by all other commands. More server instances # HyperQueue supports writing streams from different server instances into the same directory. If you run hq output-log commands over such a directory, it will detect the situation and print all server UIDs that write into the directory. You have to specify the server instance via hq output-log --server-uid= ... when working with such an output log directory. Note When a server is restored from a journal file, it will maintain the same server UID. When a server is started from scratch, a new server UID is generated. Working with non-shared file system # You do not need to have a shared file system when working with streaming. It is simply your responsibility to collect all generated files into one directory before using hq output-log commands.","title":"Output Streaming"},{"location":"jobs/streaming/#redirecting-output-to-the-stream","text":"You can redirect the output of stdout and stderr to a log file and thus enable output streaming by passing a path to a directory where the log will be stored with the --stream option: $ hq submit --stream= --array=1-10_000 ...
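For instance, with an illustrative stream directory and the example program used earlier in this section: $ hq submit --stream=/scratch/hq-stream --array=1-10000 my-computation.sh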
Output log path has to be a directory and it the user responsibility to ensure existence of the directory and visibility of each worker. This command would cause the stdout and stderr of all 10_000 tasks to be streamed into the server, which will write them to files in . The streamed data is written in a compact way independently on the number of tasks. The format also contains additional metadata, which allows the resulting file to be filtered/sorted by tasks or channel. Tip You can use selected placeholders inside the stream path.","title":"Redirecting output to the stream"},{"location":"jobs/streaming/#partial-redirection","text":"By default, both stdout and stderr will be streamed if you specify --stream and do not specify an explicit path for stdout and stderr . To stream only one of the channels, you can use the --stdout / --stderr options to redirect one of them to a file or to disable it completely. For example: # Redirecting stdout into a file, streaming stderr into `my-log` $ hq submit --stream = my-log --stdout = \"stdout-%{TASK_ID}\" ... # Streaming stdout into `my-log`, disabling stderr $ hq submit --stream = my-log --stderr = none ...","title":"Partial redirection"},{"location":"jobs/streaming/#guarantees","text":"HyperQueue provides the following guarantees regarding output streaming: When a task is Finished or Failed it is guaranteed that all data produced by the task is flushed into the streaming file. With the following two exceptions: If the streaming itself fails (e.g. because there was insufficient disk space for the stream file), then the task will fail with an error prefixed with \"Streamer:\" and no streaming guarantees will be upheld. When a task is Canceled or task fails because of time limit is reached, then the part of its stream that was buffered in the worker is dropped to avoid spending additional resources for this task.","title":"Guarantees"},{"location":"jobs/streaming/#inspecting-the-stream-files","text":"HyperQueue lets you inspect the data stored inside the stream file using various subcommands. All these commands have the following structure: $ hq output-log ","title":"Inspecting the stream files"},{"location":"jobs/streaming/#stream-summary","text":"You can display a summary of a log file using the summary subcommand: $ hq output-log summary","title":"Stream summary"},{"location":"jobs/streaming/#stream-jobs","text":"To print all job IDs that streaming in the stream path, you can run the following command: $ hq output-log jobs","title":"Stream jobs"},{"location":"jobs/streaming/#printing-stream-content","text":"If you want to simply print the (textual) content of the log file, without any associating metadata, you can use the cat subcommand: $ hq output-log cat It will print the raw content of either stdout or stderr , ordered by task id. All outputs will be concatenated one after another. You can use this to process the streamed data e.g. by a postprocessing script. By default, this command will fail if there is an unfinished stream (i.e. when some task is still running and streaming data into the log). If you want to use cat even when the log is not finished yet, use the --allow-unfinished option. 
If you want to see the output of a specific task, you can use the --task= option.","title":"Printing stream content"},{"location":"jobs/streaming/#stream-metadata","text":"If you want to inspect the contents of the log, along with its inner metadata that shows which task and which channel has produced which part of the data, you can use the show subcommand: $ hq output-log show The output will have the form J.T:C> DATA where J is a job id, T is a task id and C is 0 for the stdout channel and 1 for the stderr channel. You can filter a specific channel with the --channel=stdout/stderr flag.","title":"Stream metadata"},{"location":"jobs/streaming/#exporting-log","text":"The log can be exported into JSON with the following command: $ hq output-log export This prints the log file in JSON format to standard output.","title":"Exporting log"},{"location":"jobs/streaming/#superseded-streams","text":"When a worker crashes while executing a task, the task will be restarted . HyperQueue gives each run of a task a different INSTANCE_ID, which is a part of the stream metadata; hence, HyperQueue streaming is able to avoid mixing outputs from different executions of the same task when a task is restarted. HyperQueue automatically marks all output from previous instances of a task (i.e. all instances except the last one) as superseded . You can see statistics about superseded data via the hq output-log summary command. In the current version, superseded data is ignored by all other commands.","title":"Superseded streams"},{"location":"jobs/streaming/#more-server-instances","text":"HyperQueue supports writing streams from different server instances into the same directory. If you run hq output-log commands over such a directory, it will detect the situation and print all server UIDs that write into the directory. You have to specify the server instance via hq output-log --server-uid= ... when working with such an output log directory. Note When a server is restored from a journal file, it will maintain the same server UID. When a server is started from scratch, a new server UID is generated.","title":"More server instances"},{"location":"jobs/streaming/#working-with-non-shared-file-system","text":"You do not need to have a shared file system when working with streaming. It is simply your responsibility to collect all generated files into one directory before using hq output-log commands.","title":"Working with non-shared file system"},{"location":"python/","text":"Python API # The Python API provides greater flexibility and supports use cases that are difficult to express using the CLI, such as dynamically submitting tasks when some part of the computation is finished. It covers the full task definition, including all options available through a Job Definition File (dependencies between tasks, resource variants, etc.). You can find the HyperQueue Python API reference here . Requirements # To use the Python API, you will need at least Python 3.6 and some dependencies that will be installed automatically using pip. Installation # You can install the HyperQueue Python API from PyPI with the following command: $ python3 -m pip install hyperqueue The Python package contains a pre-compiled version of HyperQueue, so you do not have to download hq manually if you just want to use the Python API. Warning The Python API is currently distributed only for the x86-64 architecture. If you need a build for another architecture, please contact us on GitHub.
You can also build the Python package manually from our GitHub repository, but you will need to install a Rust toolchain for that. Quick start # Here is a minimal code example that spawns a local HyperQueue cluster and uses it to submit a simple job: from hyperqueue import Job , LocalCluster # Spawn a HQ server with LocalCluster () as cluster : # Add a single HyperQueue worker to the server cluster . start_worker () # Create a client and a job client = cluster . client () job = Job () # Add a task that executes `ls` to the job job . program ([ \"ls\" ]) # Submit the job submitted = client . submit ( job ) # Wait until the job completes client . wait_for_jobs ([ submitted ])","title":"Getting started"},{"location":"python/#python-api","text":"To provide greater flexibility and support use-cases that are difficult to express using the CLI such as dynamically submitting tasks when some part is finished. Python API covers all task definition including all options available through Job Definition File (dependencies between tasks, resource variants, etc) You can find the HyperQueue Python API reference here .","title":"Python API"},{"location":"python/#requirements","text":"To use the Python API, you will need at least Python 3.6 and some dependencies that will be installed automatically using pip.","title":"Requirements"},{"location":"python/#installation","text":"You can install the HyperQueue Python API from PyPi with the following command: $ python3 -m pip install hyperqueue The Python package contains a pre-compiled version of HyperQueue, so you do not have to download hq manually if you just want to use the Python API. Warning The Python API is currently distributed only for the x86-x64 architecture. If you need a build for another architecture, please contact us on GitHub. You can also build the Python package manually from our GitHub repository, but you will need to install a Rust toolchain for that.","title":"Installation"},{"location":"python/#quick-start","text":"Here is a minimal code example that spawns a local HyperQueue cluster and uses it to submit a simple job: from hyperqueue import Job , LocalCluster # Spawn a HQ server with LocalCluster () as cluster : # Add a single HyperQueue worker to the server cluster . start_worker () # Create a client and a job client = cluster . client () job = Job () # Add a task that executes `ls` to the job job . program ([ \"ls\" ]) # Submit the job submitted = client . submit ( job ) # Wait until the job completes client . wait_for_jobs ([ submitted ])","title":"Quick start"},{"location":"python/client/","text":"Client # To submit jobs using the Python API, you first need to create a Client that connects to a running HyperQueue cluster. You have two options of deploying the cluster. Once you have an instance of a Client , you can use it to submit a job. Using external deployment # If you want to run the HyperQueue infrastructure on a distributed cluster or you want to use automatic allocation , then deploy HyperQueue in any of the supported ways and then pass the server directory to the Client : from hyperqueue import Client client = Client ( \"/home/user/.hq-server/hq-current\" ) If you have used the default server directory and the server is deployed on a file-system shared by the node that executes the Python code, you can simply create an instance of a Client without passing any parameters. Using a local cluster # You can use the LocalCluster class to spawn a HyperQueue server and a set of workers directly on your local machine. 
This functionality is primarily intended for local prototyping and debugging, but it can also be used for actual computations for simple use-cases that do not require a distributed deployment of HyperQueue. When you create the cluster, it will initially only start the HyperQueue server. To connect workers to it, use the start_worker method. from hyperqueue import LocalCluster from hyperqueue.cluster import WorkerConfig with LocalCluster () as cluster : # Add a worker with 4 cores to the cluster cluster . start_worker ( WorkerConfig ( cores = 4 )) # Create a client connected to the cluster client = cluster . client () Tip You can use LocalCluster instances as context managers to make sure that the cluster is properly cleaned up at the end of the with block.","title":"Client"},{"location":"python/client/#client","text":"To submit jobs using the Python API, you first need to create a Client that connects to a running HyperQueue cluster. You have two options of deploying the cluster. Once you have an instance of a Client , you can use it to submit a job.","title":"Client"},{"location":"python/client/#using-external-deployment","text":"If you want to run the HyperQueue infrastructure on a distributed cluster or you want to use automatic allocation , then deploy HyperQueue in any of the supported ways and then pass the server directory to the Client : from hyperqueue import Client client = Client ( \"/home/user/.hq-server/hq-current\" ) If you have used the default server directory and the server is deployed on a file-system shared by the node that executes the Python code, you can simply create an instance of a Client without passing any parameters.","title":"Using external deployment"},{"location":"python/client/#using-a-local-cluster","text":"You can use the LocalCluster class to spawn a HyperQueue server and a set of workers directly on your local machine. This functionality is primarily intended for local prototyping and debugging, but it can also be used for actual computations for simple use-cases that do not require a distributed deployment of HyperQueue. When you create the cluster, it will initially only start the HyperQueue server. To connect workers to it, use the start_worker method. from hyperqueue import LocalCluster from hyperqueue.cluster import WorkerConfig with LocalCluster () as cluster : # Add a worker with 4 cores to the cluster cluster . start_worker ( WorkerConfig ( cores = 4 )) # Create a client connected to the cluster client = cluster . client () Tip You can use LocalCluster instances as context managers to make sure that the cluster is properly cleaned up at the end of the with block.","title":"Using a local cluster"},{"location":"python/dependencies/","text":"Task dependencies # One of the most useful features of the HyperQueue Python API is that it allows you to define dependencies between individual tasks of a job. If a task B depends on task A , then B will not be executed until A has (successfully) finished. Using dependencies, you can describe arbitrarily complex DAG (directed acyclic graph) workflows. Notice HyperQueue jobs are independent of each other, so dependencies can only be specified between tasks within a single job. Defining dependencies # To define a dependency between tasks, you will first need to store the Task instances that you get when you create a task . 
You can then use the deps parameter when creating a new task and pass an existing task instance to define a dependency: from hyperqueue import Job job = Job () # Create a first task that generates data task_a = job . program ([ \"generate-data\" , \"--file\" , \"out.txt\" ]) # Create a dependent task that consumes the data job . program ([ \"consume-data\" , \"--file\" , \"out.txt\" ], deps = [ task_a ]) The second task will not be started until the first one successfully finishes. You can also depend on multiple tasks at once: # Create several tasks that generate data tasks = [ job . program ([ \"generate-data\" , \"--file\" , f \"out- { i } .txt\" ]) for i in range ( 5 )] # Create a dependent task that consumes the data job . program ([ \"consume-data\" , \"--file\" , \"out- %d .txt\" ], deps = [ tasks ]) Dependencies are transitive, so you can build an arbitrary graph: task_a = job . program ([ \"generate\" , \"1\" ]) task_b = job . program ([ \"generate\" , \"2\" ]) task_c = job . program ([ \"compute\" ], deps = [ task_a , task_b ]) task_d = job . program ([ \"postprocess\" ], deps = [ task_c ]) In this case, task D will not start until all the three previous tasks are successfully finished.","title":"Dependencies"},{"location":"python/dependencies/#task-dependencies","text":"One of the most useful features of the HyperQueue Python API is that it allows you to define dependencies between individual tasks of a job. If a task B depends on task A , then B will not be executed until A has (successfully) finished. Using dependencies, you can describe arbitrarily complex DAG (directed acyclic graph) workflows. Notice HyperQueue jobs are independent of each other, so dependencies can only be specified between tasks within a single job.","title":"Task dependencies"},{"location":"python/dependencies/#defining-dependencies","text":"To define a dependency between tasks, you will first need to store the Task instances that you get when you create a task . You can then use the deps parameter when creating a new task and pass an existing task instance to define a dependency: from hyperqueue import Job job = Job () # Create a first task that generates data task_a = job . program ([ \"generate-data\" , \"--file\" , \"out.txt\" ]) # Create a dependent task that consumes the data job . program ([ \"consume-data\" , \"--file\" , \"out.txt\" ], deps = [ task_a ]) The second task will not be started until the first one successfully finishes. You can also depend on multiple tasks at once: # Create several tasks that generate data tasks = [ job . program ([ \"generate-data\" , \"--file\" , f \"out- { i } .txt\" ]) for i in range ( 5 )] # Create a dependent task that consumes the data job . program ([ \"consume-data\" , \"--file\" , \"out- %d .txt\" ], deps = [ tasks ]) Dependencies are transitive, so you can build an arbitrary graph: task_a = job . program ([ \"generate\" , \"1\" ]) task_b = job . program ([ \"generate\" , \"2\" ]) task_c = job . program ([ \"compute\" ], deps = [ task_a , task_b ]) task_d = job . program ([ \"postprocess\" ], deps = [ task_c ]) In this case, task D will not start until all the three previous tasks are successfully finished.","title":"Defining dependencies"},{"location":"python/submit/","text":"Submitting jobs # You can use the Python API to submit jobs (directed acyclic graphs of tasks) through a Client . 
In addition to the functionality offered by the HyperQueue CLI, you can use the Python API to add dependencies between jobs, configure each task individually and create tasks out of Python functions . Job # To build a job, you first have to create an instance of the Job class. from hyperqueue import Job job = Job () Tasks # Once you have created a job, you can add tasks to it. Currently, each task can represent either the execution of an external program or the execution of a Python function . To create complex workflows, you can also specify dependencies between tasks. External programs # To create a task that will execute an external program, you can use the program method of a Job : job . program ([ \"/bin/my-program\" , \"foo\" , \"bar\" , \"--arg\" , \"42\" ]) You can pass the program arguments or various other parameters to the task. The program method will return a Task object that represents the created task. This object can be used further e.g. for defining dependencies . Python functions # If you want to execute a Python function as a task, you can use the function method of a Job : def preprocess_data ( fast , path ): with open ( path ) as f : data = f . read () if fast : preprocess_fast ( data ) else : preprocess ( data ) job . function ( preprocess_data , args = ( True , \"/data/a.txt\" )) job . function ( preprocess_data , args = ( False , \"/data/b.txt\" )) You can pass both positional and keyword arguments to the function. The arguments will be serialized using cloudpickle . Python tasks can be useful to perform e.g. various data preprocessing and organization tasks. You can co-locate the logic of Python tasks together with the code that defines the submitted workflow (job), without the need to write an additional external script. Same as with the program method, function will return a Task that can used to define dependencies . Notice Currently, a new Python interpreter will be started for each Python task. Python environment # When you use a Python function as a task, the task will attempt to import the hyperqueue package when it executes (to perform some bookkeeping on the background). This function will be executed on a worker - this means that it needs to have access to the correct Python version (and virtual environment) that contains the hyperqueue package! To make sure that the function will be executed in the correct Python environment, you can use PythonEnv and its prologue argument. It lets you specify a (shell) command that will be executed before the Python interpreter that executes your function is spawned. from hyperqueue.task.function import PythonEnv from hyperqueue import Client env = PythonEnv ( prologue = \"ml Python/XYZ && source //bin/activate\" ) client = Client ( python_env = env ) If you use Python functions as tasks, it is pretty much required to use PythonEnv , unless your workers are already spawned in an environment that has the correct Python loaded (e.g. using .bashrc or a similar mechanism). Parametrizing tasks # You can parametrize both external or Python tasks by setting their working directory, standard output paths, environment variables or HyperQueue specific parameters like resources or time limits . In contrast to the CLI, where you can only use a single set of parameters for all tasks of a job, with the Python API you can specify these parameters individually for each task. You can find more details in the documentation of the program or function methods. 
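To tie the above together, here is a small, hedged end-to-end sketch; the executed programs and the message are illustrative (mirroring examples used elsewhere in this documentation), and it relies only on the Job , Client , program , function , deps , submit and wait_for_jobs APIs described here.

from hyperqueue import Client, Job

def log_progress(message):
    # A Python task co-located with the workflow definition
    print(message)

job = Job()
# External program that produces a file, followed by a dependent consumer
task_a = job.program(["generate-data", "--file", "out.txt"])
job.program(["consume-data", "--file", "out.txt"], deps=[task_a])
# An independent Python function task
job.function(log_progress, args=("preprocessing submitted",))

# Connect to a running HyperQueue server (default server directory assumed)
client = Client()
submitted = client.submit(job)
client.wait_for_jobs([submitted])  # block until the whole job finishes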
Submitting a job # Once you have added some tasks to the job, you can submit it using the Client 's submit method: client = Client () submitted = client . submit ( job ) To wait until the job has finished executing, use the wait_for_jobs method: client . wait_for_jobs ([ submitted ])","title":"Submitting jobs"},{"location":"python/submit/#submitting-jobs","text":"You can use the Python API to submit jobs (directed acyclic graphs of tasks) through a Client . In addition to the functionality offered by the HyperQueue CLI, you can use the Python API to add dependencies between jobs, configure each task individually and create tasks out of Python functions .","title":"Submitting jobs"},{"location":"python/submit/#job","text":"To build a job, you first have to create an instance of the Job class. from hyperqueue import Job job = Job ()","title":"Job"},{"location":"python/submit/#tasks","text":"Once you have created a job, you can add tasks to it. Currently, each task can represent either the execution of an external program or the execution of a Python function . To create complex workflows, you can also specify dependencies between tasks.","title":"Tasks"},{"location":"python/submit/#external-programs","text":"To create a task that will execute an external program, you can use the program method of a Job : job . program ([ \"/bin/my-program\" , \"foo\" , \"bar\" , \"--arg\" , \"42\" ]) You can pass the program arguments or various other parameters to the task. The program method will return a Task object that represents the created task. This object can be used further e.g. for defining dependencies .","title":"External programs"},{"location":"python/submit/#python-functions","text":"If you want to execute a Python function as a task, you can use the function method of a Job : def preprocess_data ( fast , path ): with open ( path ) as f : data = f . read () if fast : preprocess_fast ( data ) else : preprocess ( data ) job . function ( preprocess_data , args = ( True , \"/data/a.txt\" )) job . function ( preprocess_data , args = ( False , \"/data/b.txt\" )) You can pass both positional and keyword arguments to the function. The arguments will be serialized using cloudpickle . Python tasks can be useful to perform e.g. various data preprocessing and organization tasks. You can co-locate the logic of Python tasks together with the code that defines the submitted workflow (job), without the need to write an additional external script. Same as with the program method, function will return a Task that can used to define dependencies . Notice Currently, a new Python interpreter will be started for each Python task.","title":"Python functions"},{"location":"python/submit/#python-environment","text":"When you use a Python function as a task, the task will attempt to import the hyperqueue package when it executes (to perform some bookkeeping on the background). This function will be executed on a worker - this means that it needs to have access to the correct Python version (and virtual environment) that contains the hyperqueue package! To make sure that the function will be executed in the correct Python environment, you can use PythonEnv and its prologue argument. It lets you specify a (shell) command that will be executed before the Python interpreter that executes your function is spawned. 
from hyperqueue.task.function import PythonEnv from hyperqueue import Client env = PythonEnv ( prologue = \"ml Python/XYZ && source //bin/activate\" ) client = Client ( python_env = env ) If you use Python functions as tasks, it is pretty much required to use PythonEnv , unless your workers are already spawned in an environment that has the correct Python loaded (e.g. using .bashrc or a similar mechanism).","title":"Python environment"},{"location":"python/submit/#parametrizing-tasks","text":"You can parametrize both external or Python tasks by setting their working directory, standard output paths, environment variables or HyperQueue specific parameters like resources or time limits . In contrast to the CLI, where you can only use a single set of parameters for all tasks of a job, with the Python API you can specify these parameters individually for each task. You can find more details in the documentation of the program or function methods.","title":"Parametrizing tasks"},{"location":"python/submit/#submitting-a-job","text":"Once you have added some tasks to the job, you can submit it using the Client 's submit method: client = Client () submitted = client . submit ( job ) To wait until the job has finished executing, use the wait_for_jobs method: client . wait_for_jobs ([ submitted ])","title":"Submitting a job"}]} \ No newline at end of file diff --git a/latest/sitemap.xml b/latest/sitemap.xml index b3622e7d7..d5107616d 100644 --- a/latest/sitemap.xml +++ b/latest/sitemap.xml @@ -2,152 +2,152 @@ https://it4innovations.github.io/hyperqueue/stable/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/cheatsheet/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/faq/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/installation/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/other-tools/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/quickstart/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/cli/dashboard/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/cli/output-mode/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/cli/shortcuts/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/deployment/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/deployment/allocation/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/deployment/cloud/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/deployment/server/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/deployment/worker/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/examples/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/examples/iterative-computation/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/arrays/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/cresources/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/directives/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/failure/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/jobfile/ - 
2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/jobs/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/multinode/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/openjobs/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/resources/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/jobs/streaming/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/python/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/python/client/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/python/dependencies/ - 2024-10-15 + 2024-10-17 daily https://it4innovations.github.io/hyperqueue/stable/python/submit/ - 2024-10-15 + 2024-10-17 daily \ No newline at end of file diff --git a/latest/sitemap.xml.gz b/latest/sitemap.xml.gz index cada026b22782b35fe62a3f3b2db63dcb80ef9b9..a096221c63e013ada6ed594bdbe1796f32c5629f 100644 GIT binary patch literal 452 zcmV;#0XzO5iwFo7un=bg|8r?{Wo=<_E_iKh0M(aWj-xOP$KU4^74MLCG}3;EGH-i= zodbm6lI$cgwVigj{iGeV>H$_OIS*j0Kjw#JjMKkg7m5tBtK!VZ#)~XJjk#X09S4DA7w(W|!_L?>_ONdc z->PKuJ;;Xh)nxv;_>KvZF$>{%6W$kmAPu2cn%;(=O0 zjI4_LT!MSAHtDKs7lWBB`w=Zo>d3LkQlzakppKnAlDHze%6x();+i0fR_xjp(dp}p z4{zWimPso-1J$`oEBzJeT^T)m*tA4oozQh;rmDX{>oC<`ukXYCgCiSDWLMV5?FbkP zIZ((x1p;>~9fj9wvd*L{?uje}HibN^LSI(# u%KrKra^2`@+bV73b?AjWL3Tk|op1bH#uWZT=Ic(G!~yfMIduDG2if5i@_cOG ze*bE}H?PA}Unnxju8K1s8!xi_H0FA}b{qtfUARM<4Le^i*lbR-?|wAH)2^;tfG_L5 z`iO40Md2DI$VFIFl8cXJS5#x9`o>jHExP{6R*F@ovM?Z(GAmL=gdJnx@h+QYs* ze5;bp_aJlL<{3@|=V7uTuDyBid;yxjg#U=O=E4el08xc~vS(GiAy*^DyHXW6iU(>1 zF|sP|a|!Of+N7(lT?}Tj>_@aTsUycCOOdwHfI4>eNaBjd8SSGFT3{>YTt@KxeC8=KJa=tg{Tu;VV$HBC1h>m?;RZ%$U2j*xF@m@*c9@t3Vm6{ uEBotj$aSNqZL74A*P$2k1la{;b-wX)8B_QVnZKVHWcUQT-Fq)I761Sa{oq&t