From 5e8b6ddbff6de74eecc948892cbb3ea3ded1d266 Mon Sep 17 00:00:00 2001
From: yu lin <735122171@qq.com>
Date: Thu, 11 Jul 2024 16:51:21 +0800
Subject: [PATCH] Support setting shared memory for training job. (#1104)

Signed-off-by: Syulin7 <735122171@qq.com>
---
 Dockerfile.install                          |  6 +--
 Dockerfile.notebook.cpu                     |  6 +--
 Dockerfile.notebook.kubeflow                |  6 +--
 charts/pytorchjob/templates/pytorchjob.yaml | 12 +++---
 charts/pytorchjob/values.yaml               |  1 -
 charts/tfjob/templates/tfjob.yaml           | 41 ++++++++++++++++++++-
 pkg/apis/training/pytorchjob_builder.go     |  7 ++++
 pkg/apis/training/tfjob_builder.go          |  7 ++++
 pkg/apis/types/submit_pytorchjob.go         |  3 ++
 pkg/apis/types/submit_tfjob.go              |  2 +
 pkg/argsbuilder/submit_pytorchjob.go        |  7 ++++
 pkg/argsbuilder/submit_tfjob.go             |  7 ++++
 12 files changed, 88 insertions(+), 17 deletions(-)

diff --git a/Dockerfile.install b/Dockerfile.install
index 0b12337a5..caf5eb3f5 100644
--- a/Dockerfile.install
+++ b/Dockerfile.install
@@ -8,12 +8,12 @@ COPY . .
 
 RUN make
 
-RUN wget https://get.helm.sh/helm-v2.14.1-linux-amd64.tar.gz && \
-    tar -xvf helm-v2.14.1-linux-amd64.tar.gz && \
+RUN wget https://get.helm.sh/helm-v3.13.3-linux-amd64.tar.gz && \
+    tar -xvf helm-v3.13.3-linux-amd64.tar.gz && \
     mv linux-amd64/helm /usr/local/bin/helm && \
     chmod u+x /usr/local/bin/helm
 
-ENV K8S_VERSION v1.13.6
+ENV K8S_VERSION v1.28.4
 RUN curl -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl && chmod +x /usr/local/bin/kubectl
 
 
diff --git a/Dockerfile.notebook.cpu b/Dockerfile.notebook.cpu
index da33a833e..a5db3f119 100644
--- a/Dockerfile.notebook.cpu
+++ b/Dockerfile.notebook.cpu
@@ -12,12 +12,12 @@ COPY . .
 
 RUN make
 
-RUN wget https://get.helm.sh/helm-v2.14.1-linux-amd64.tar.gz && \
-    tar -xvf helm-v2.14.1-linux-amd64.tar.gz && \
+RUN wget https://get.helm.sh/helm-v3.13.3-linux-amd64.tar.gz && \
+    tar -xvf helm-v3.13.3-linux-amd64.tar.gz && \
     mv linux-amd64/helm /usr/local/bin/helm && \
     chmod u+x /usr/local/bin/helm
 
-ENV K8S_VERSION v1.13.6
+ENV K8S_VERSION v1.28.4
 RUN curl -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl && chmod +x /usr/local/bin/kubectl
 
 FROM $BASE_IMAGE
diff --git a/Dockerfile.notebook.kubeflow b/Dockerfile.notebook.kubeflow
index 97bf6140b..798f66090 100644
--- a/Dockerfile.notebook.kubeflow
+++ b/Dockerfile.notebook.kubeflow
@@ -11,12 +11,12 @@ COPY . .
 
 RUN make
 
-RUN wget https://get.helm.sh/helm-v2.14.1-linux-amd64.tar.gz && \
-    tar -xvf helm-v2.14.1-linux-amd64.tar.gz && \
+RUN wget https://get.helm.sh/helm-v3.13.3-linux-amd64.tar.gz && \
+    tar -xvf helm-v3.13.3-linux-amd64.tar.gz && \
     mv linux-amd64/helm /usr/local/bin/helm && \
     chmod u+x /usr/local/bin/helm
 
-ENV K8S_VERSION v1.13.6
+ENV K8S_VERSION v1.28.4
 RUN curl -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/${K8S_VERSION}/bin/linux/amd64/kubectl && chmod +x /usr/local/bin/kubectl
 
 FROM $BASE_IMAGE
diff --git a/charts/pytorchjob/templates/pytorchjob.yaml b/charts/pytorchjob/templates/pytorchjob.yaml
index 70d8ce62b..97fc2a479 100644
--- a/charts/pytorchjob/templates/pytorchjob.yaml
+++ b/charts/pytorchjob/templates/pytorchjob.yaml
@@ -160,11 +160,11 @@ spec:
             name: {{ .name }}
         {{- end }}
         {{- end }}
-        {{- if .Values.shmSize }}
+        {{- if .Values.shareMemory }}
         - name: dshm
           emptyDir:
             medium: Memory
-            sizeLimit: {{ .Values.shmSize }}
+            sizeLimit: {{ .Values.shareMemory }}
         {{- end }}
         {{- if .Values.syncMode }}
         initContainers:
@@ -324,7 +324,7 @@ spec:
             mountPath: "{{ $destPath }}"
           {{- end }}
           {{- end }}
-          {{- if .Values.shmSize }}
+          {{- if .Values.shareMemory }}
           - mountPath: /dev/shm
             name: dshm
           {{- end }}
@@ -452,11 +452,11 @@ spec:
             name: {{ .name }}
         {{- end }}
         {{- end }}
-        {{- if .Values.shmSize }}
+        {{- if .Values.shareMemory }}
         - name: dshm
           emptyDir:
             medium: Memory
-            sizeLimit: {{ .Values.shmSize }}
+            sizeLimit: {{ .Values.shareMemory }}
         {{- end }}
         {{- if .Values.syncMode }}
         initContainers:
@@ -616,7 +616,7 @@ spec:
             mountPath: "{{ $destPath }}"
           {{- end }}
           {{- end }}
-          {{- if .Values.shmSize }}
+          {{- if .Values.shareMemory }}
           - mountPath: /dev/shm
             name: dshm
           {{- end }}
diff --git a/charts/pytorchjob/values.yaml b/charts/pytorchjob/values.yaml
index 8d9d15b4c..6b2469bc4 100644
--- a/charts/pytorchjob/values.yaml
+++ b/charts/pytorchjob/values.yaml
@@ -12,7 +12,6 @@ rsyncImage: registry.cn-zhangjiakou.aliyuncs.com/acs/rsync:v3.1.0-aliyun
 
 # git sync image
 gitImage: registry.cn-zhangjiakou.aliyuncs.com/acs/git-sync:v3.3.5
-shmSize: 2Gi
 privileged: false
 
 useTensorboard: false
diff --git a/charts/tfjob/templates/tfjob.yaml b/charts/tfjob/templates/tfjob.yaml
index 7ec3b0c78..f86678536 100644
--- a/charts/tfjob/templates/tfjob.yaml
+++ b/charts/tfjob/templates/tfjob.yaml
@@ -204,6 +204,12 @@ spec:
             name: {{ .name }}
         {{- end }}
         {{- end }}
+        {{- if .Values.shareMemory }}
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: {{ .Values.shareMemory }}
+        {{- end }}
         {{- if .Values.syncMode }}
         initContainers:
         - name: init-code
@@ -357,6 +363,10 @@ spec:
             mountPath: "{{ $destPath }}"
          {{- end }}
           {{- end }}
+          {{- if .Values.shareMemory }}
+          - mountPath: /dev/shm
+            name: dshm
+          {{- end }}
           {{- if $dataDirs }}
           {{- range $dataDirs }}
           - mountPath: {{ .containerPath }}
@@ -530,6 +540,12 @@ spec:
             name: {{ .name }}
         {{- end }}
         {{- end }}
+        {{- if .Values.shareMemory }}
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: {{ .Values.shareMemory }}
+        {{- end }}
         {{- if .Values.syncMode }}
         initContainers:
         - name: init-code
@@ -709,7 +725,10 @@ spec:
             name: {{ .name }}
           {{- end }}
           {{- end }}
-
+          {{- if .Values.shareMemory }}
+          - mountPath: /dev/shm
+            name: dshm
+          {{- end }}
   {{- end }}
   {{- if .Values.chief }}
   {{ .Values.chiefName | indent 4}}:
@@ -873,6 +892,12 @@ spec:
             name: {{ .name }}
         {{- end }}
         {{- end }}
+        {{- if .Values.shareMemory }}
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: {{ .Values.shareMemory }}
+        {{- end }}
         {{- if .Values.syncMode }}
         initContainers:
         - name: init-code
@@ -1050,6 +1075,10 @@ spec:
             name: {{ .name }}
           {{- end }}
           {{- end }}
+          {{- if .Values.shareMemory }}
+          - mountPath: /dev/shm
+            name: dshm
+          {{- end }}
   {{- end }}
   {{- if .Values.evaluator }}
     Evaluator:
@@ -1175,6 +1204,12 @@ spec:
             name: {{ .name }}
         {{- end }}
         {{- end }}
+        {{- if .Values.shareMemory }}
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: {{ .Values.shareMemory }}
+        {{- end }}
         {{- if .Values.syncMode }}
         initContainers:
         - name: init-code
@@ -1330,4 +1365,8 @@ spec:
             name: {{ .name }}
           {{- end }}
           {{- end }}
+          {{- if .Values.shareMemory }}
+          - mountPath: /dev/shm
+            name: dshm
+          {{- end }}
   {{- end }}
diff --git a/pkg/apis/training/pytorchjob_builder.go b/pkg/apis/training/pytorchjob_builder.go
index 4fcacc975..8658bba64 100644
--- a/pkg/apis/training/pytorchjob_builder.go
+++ b/pkg/apis/training/pytorchjob_builder.go
@@ -296,6 +296,13 @@ func (b *PytorchJobBuilder) TTLSecondsAfterFinished(ttl int32) *PytorchJobBuilde
 	return b
 }
 
+func (b *PytorchJobBuilder) ShareMemory(shm string) *PytorchJobBuilder {
+	if shm != "" {
+		b.args.ShareMemory = shm
+	}
+	return b
+}
+
 // Build is used to build the job
 func (b *PytorchJobBuilder) Build() (*Job, error) {
 	for key, value := range b.argValues {
diff --git a/pkg/apis/training/tfjob_builder.go b/pkg/apis/training/tfjob_builder.go
index b97b8cc6d..3aa71540f 100644
--- a/pkg/apis/training/tfjob_builder.go
+++ b/pkg/apis/training/tfjob_builder.go
@@ -481,6 +481,13 @@ func (b *TFJobBuilder) TTLSecondsAfterFinished(ttl int32) *TFJobBuilder {
 	return b
 }
 
+func (b *TFJobBuilder) ShareMemory(shm string) *TFJobBuilder {
+	if shm != "" {
+		b.args.ShareMemory = shm
+	}
+	return b
+}
+
 func (b *TFJobBuilder) Build() (*Job, error) {
 	for key, value := range b.argValues {
 		b.AddArgValue(key, value)
diff --git a/pkg/apis/types/submit_pytorchjob.go b/pkg/apis/types/submit_pytorchjob.go
index 384fface1..c2356c34b 100644
--- a/pkg/apis/types/submit_pytorchjob.go
+++ b/pkg/apis/types/submit_pytorchjob.go
@@ -42,4 +42,7 @@ type SubmitPyTorchJobArgs struct {
 
 	// TrainingOperatorCRD compatible with training-operator crd.
 	TrainingOperatorCRD bool `yaml:"trainingOperatorCRD,omitempty"`
+
+	// ShareMemory Specifies the shared memory size
+	ShareMemory string `yaml:"shareMemory"`
 }
diff --git a/pkg/apis/types/submit_tfjob.go b/pkg/apis/types/submit_tfjob.go
index 058f08037..decb62ae0 100644
--- a/pkg/apis/types/submit_tfjob.go
+++ b/pkg/apis/types/submit_tfjob.go
@@ -87,6 +87,8 @@ type SubmitTFJobArgs struct {
 	StartingDeadlineSeconds int64 `yaml:"startingDeadlineSeconds,omitempty"`
 	// Defines the TTL for cleaning up finished TFJobs. Defaults to infinite.
 	TTLSecondsAfterFinished int32 `yaml:"ttlSecondsAfterFinished,omitempty"`
+	// ShareMemory Specifies the shared memory size
+	ShareMemory string `yaml:"shareMemory"`
 
 	// for common args
 	CommonSubmitArgs `yaml:",inline"`
diff --git a/pkg/argsbuilder/submit_pytorchjob.go b/pkg/argsbuilder/submit_pytorchjob.go
index 567a79923..8af5ae88d 100644
--- a/pkg/argsbuilder/submit_pytorchjob.go
+++ b/pkg/argsbuilder/submit_pytorchjob.go
@@ -82,6 +82,7 @@ func (s *SubmitPytorchJobArgsBuilder) AddCommandFlags(command *cobra.Command) {
 	command.Flags().StringVar(&s.args.Memory, "memory", "", "the memory resource to use for the training, like 1Gi.")
 	command.Flags().DurationVar(&runningTimeout, "running-timeout", runningTimeout, "Specifies the duration since startTime during which the job can remain active before it is terminated(e.g. '5s', '1m', '2h22m').")
 	command.Flags().DurationVar(&ttlAfterFinished, "ttl-after-finished", ttlAfterFinished, "Defines the TTL for cleaning up finished PytorchJobs(e.g. '5s', '1m', '2h22m'). Defaults to infinite.")
+	command.Flags().StringVar(&s.args.ShareMemory, "share-memory", "2Gi", "the shared memory of each replica to run the job, default 2Gi.")
 
 	s.AddArgValue("running-timeout", &runningTimeout).
 		AddArgValue("ttl-after-finished", &ttlAfterFinished)
@@ -163,6 +164,12 @@ func (s *SubmitPytorchJobArgsBuilder) check() error {
 	if s.args.TTLSecondsAfterFinished < 0 {
 		return fmt.Errorf("--ttl-after-finished is invalid")
 	}
+	if s.args.ShareMemory != "" {
+		_, err := resource.ParseQuantity(s.args.ShareMemory)
+		if err != nil {
+			return fmt.Errorf("--share-memory is invalid")
+		}
+	}
 	return nil
 }
 
diff --git a/pkg/argsbuilder/submit_tfjob.go b/pkg/argsbuilder/submit_tfjob.go
index 787644bd2..81d25d01e 100644
--- a/pkg/argsbuilder/submit_tfjob.go
+++ b/pkg/argsbuilder/submit_tfjob.go
@@ -170,6 +170,7 @@ func (s *SubmitTFJobArgsBuilder) AddCommandFlags(command *cobra.Command) {
 	command.Flags().StringArrayVar(&evaluatorSelectors, "evaluator-selector", []string{}, `assigning jobs with "Evaluator" role to some k8s particular nodes(this option would cover --selector), usage: "--evaluator-selector=key=value"`)
 	command.Flags().StringArrayVar(&psSelectors, "ps-selector", []string{}, `assigning jobs with "PS" role to some k8s particular nodes(this option would cover --selector), usage: "--ps-selector=key=value"`)
 	command.Flags().StringVar(&roleSequence, "role-sequence", "", `specify the tfjob role sequence,like: "Worker,PS,Chief,Evaluator" or "w,p,c,e"`)
+	command.Flags().StringVar(&s.args.ShareMemory, "share-memory", "2Gi", "the shared memory of each replica to run the job, default 2Gi.")
 
 	s.AddArgValue("worker-selector", &workerSelectors).
 		AddArgValue("chief-selector", &chiefSelectors).
@@ -342,6 +343,12 @@ func (s *SubmitTFJobArgsBuilder) check() error {
 	if s.args.TTLSecondsAfterFinished < 0 {
 		return fmt.Errorf("--ttl-after-finished is invalid")
 	}
+	if s.args.ShareMemory != "" {
+		_, err := resource.ParseQuantity(s.args.ShareMemory)
+		if err != nil {
+			return fmt.Errorf("--share-memory is invalid")
+		}
+	}
 	return nil
 }
 
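
Usage sketch (illustrative, not part of the patch): once a shared memory size is set, for example by passing --share-memory=4Gi to the arena submit tfjob or arena submit pytorchjob commands whose flags are registered above, the chart templates render an in-memory emptyDir volume mounted at /dev/shm into each replica pod, roughly as in the fragment below; the 4Gi value is a placeholder.

    # volume added to the pod spec of each replica
    volumes:
    - name: dshm
      emptyDir:
        medium: Memory
        sizeLimit: 4Gi
    # mount added to the training container
    volumeMounts:
    - mountPath: /dev/shm
      name: dshm

A per-job /dev/shm size matters because multi-process data loaders (for example PyTorch DataLoader workers) exchange tensors through shared memory and can fail when the default size is too small.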