Skip to content

Commit

Permalink
Merge pull request #21 from konnase/main
Browse files Browse the repository at this point in the history
refactor job spec
  • Loading branch information
konnase authored Jun 30, 2022
2 parents e9d0249 + 949f307 commit 02c5aae
Show file tree
Hide file tree
Showing 68 changed files with 13,182 additions and 11,081 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ name: Release
on: [push]

env:
version: v1.0.0
version: v1.1.0

jobs:
docker:
Expand Down
19 changes: 15 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

# di-operator version
VERSION ?= v1.0.0
APP_VERSION ?= 0.1.0
VERSION ?= v1.1.0
MASTER_VERSION := $(VERSION)

COMMIT_SHORT_SHA=$(shell git log -n 1 | head -n 1 | sed -e 's/^commit //' | head -c 8)
Expand Down Expand Up @@ -55,7 +56,7 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=di-operator-cluster-role webhook paths="./..." output:crd:artifacts:config=config/crd/bases
cd config/manager && $(KUSTOMIZE) edit set image ${IMG_BASE}=${MASTER_IMG}
./hack/update-image-tags.sh config/manager ${MASTER_VERSION}
./hack/update-version.sh ${MASTER_VERSION}
./hack/update-version.sh ${MASTER_VERSION} ${APP_VERSION}
## generate installer scripts
$(KUSTOMIZE) build config/default > config/di-manager.yaml

Expand All @@ -66,7 +67,8 @@ dev-manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=di-operator-cluster-role webhook paths="./..." output:crd:artifacts:config=config/crd/bases
cd config/manager && $(KUSTOMIZE) edit set image ${IMG_BASE}=${IMG}
./hack/update-image-tags.sh config/manager ${VERSION}
./hack/update-version.sh ${VERSION}
./hack/update-version.sh ${VERSION} ${APP_VERSION}
$(KUSTOMIZE) build config/default > config/di-manager.yaml

generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."
Expand All @@ -78,7 +80,8 @@ vet: ## Run go vet against code.
go vet ./...

# Run golangci-lint
lint:
# lint: golangci-lint
lint:
golangci-lint run -v --timeout=5m

.PHONY: test
Expand All @@ -89,6 +92,10 @@ test: ginkgo ## Run tests.
go tool cover -func=./pkg/common/coverage.out
go tool cover -func=./pkg/controllers/coverage.out

.PHONY: test-e2e
test-e2e: ginkgo dev-deploy ## Run e2e tests
${GINKGO} -cover ./test/e2e

##@ Build

build: generate ## Build di-operator binary.
Expand Down Expand Up @@ -140,6 +147,10 @@ GINKGO = $(shell pwd)/bin/ginkgo
ginkgo: ## Download ginkgo locally if necessary.
$(call go-get-tool,$(GINKGO),github.com/onsi/ginkgo/[email protected])

GOLANGCI_LINT = $(shell pwd)/bin/golangci-lint
golangci-lint: ## Download golangci-lint locally if necessary.
$(call go-get-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/[email protected])

# go-get-tool will 'go get' any package $2 and install it to $1.
PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
define go-get-tool
Expand Down
47 changes: 23 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,34 +28,33 @@ di-server-7b86ff8df4-jfgmp 1/1 Running 0 59s

```bash
# submit DIJob
$ kubectl create -f config/samples/dijob-gobigger.yaml
$ kubectl create -f config/samples/atari-dqn-tasks.yaml

# get pod and you will see coordinator is created by di-operator
# a few seconds later, you will see collectors and learners created by di-server
$ kubectl get pod
NAME READY STATUS RESTARTS AGE
gobigger-test-0-0 1/1 Running 0 4m17s
gobigger-test-0-1 1/1 Running 0 4m17s

# get logs of coordinator
$ kubectl logs gobigger-test-0-0
Bind subprocesses on these addresses: ['tcp://10.148.3.4:22270',
'tcp://10.148.3.4:22271']
[Warning] no enough data: 128/0
...
[Warning] no enough data: 128/120
Current Training: Train Iter(0) Loss(102.256)
Current Training: Train Iter(0) Loss(103.133)
Current Training: Train Iter(20) Loss(28.795)
Current Training: Train Iter(20) Loss(32.837)
...
Current Training: Train Iter(360) Loss(12.850)
Current Training: Train Iter(340) Loss(11.812)
Current Training: Train Iter(380) Loss(12.892)
Current Training: Train Iter(360) Loss(13.621)
Current Training: Train Iter(400) Loss(15.183)
Current Training: Train Iter(380) Loss(14.187)
Current Evaluation: Train Iter(404) Eval Reward(-1788.326)
NAME READY STATUS RESTARTS AGE
job-with-tasks-collector-0 1/1 Running 0 2s
job-with-tasks-collector-1 1/1 Running 0 2s
job-with-tasks-evaluator-0 1/1 Running 0 2s
job-with-tasks-learner-0 1/1 Running 0 2s

# get logs of tasks
$ kubectl logs job-with-tasks-evaluator-0
/opt/conda/lib/python3.8/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /opt/conda/conda-bld/pytorch_1607370172916/work/c10/cuda/CUDAFunctions.cpp:100.)
return torch._C._cuda_getDeviceCount() > 0
[06-28 08:25:29] INFO Evaluator running on node 1 func.py:58
A.L.E: Arcade Learning Environment (version +a54a328)
[Powered by Stella]
/opt/conda/lib/python3.8/site-packages/ale_py/roms/__init__.py:44: UserWarning: ale_py.roms contains unsupported ROMs: /opt/conda/lib/python3.8/site-packages/AutoROM/roms/{joust.bin, warlords.bin, maze_craze.bin, combat.bin}
warnings.warn(
[06-28 08:25:46] INFO Evaluation: Train Iter(0) Env Step(0) Eval Reward(-21.000) func.py:58
[06-28 08:25:46] WARNING You have not installed memcache package! DI-engine has changed to some alternatives.

$ kubectl logs job-with-tasks-learner-0
/opt/conda/lib/python3.8/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /opt/conda/conda-bld/pytorch_1607370172916/work/c10/cuda/CUDAFunctions.cpp:100.)
return torch._C._cuda_getDeviceCount() > 0
[06-28 08:25:27] INFO Learner running on node 0
```
## User Guide
Expand Down
2 changes: 1 addition & 1 deletion chart/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
version: 1.0.0
version: 1.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application.
Expand Down
2 changes: 0 additions & 2 deletions chart/templates/config.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
apiVersion: v1
data:
DI_JOB_DEFAULT_RESOURCES: '{"resources": {"requests": {"cpu": 1, "memory": "2Gi"}}}'
DI_ORCHESTRATOR_VERSION: {{ .Values.tag }}
DI_SERVER_URL: http://{{ .Values.serverName }}.{{ .Release.Namespace }}.{{ .Values.serviceDomainName }}:{{ .Values.serverPort }}
kind: ConfigMap
metadata:
name: di-config
Expand Down
10 changes: 9 additions & 1 deletion chart/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ spec:
- --probe-addr=:8080
- --metric-addr=:8443
- --leader-elect
- --qps={{ .Values.qps }}
- --burst={{ .Values.burst }}
- --service-domain-name={{ .Values.serviceDomainName }}
- --di-server-url=http://{{ .Values.serverName }}.{{ .Release.Namespace }}.{{ .Values.serviceDomainName }}:{{ .Values.serverPort }}
command:
- /di-orchestrator
- operator
Expand Down Expand Up @@ -75,7 +79,11 @@ spec:
containers:
- args:
- --zap-devel=true
- --server-bind-address=:8081
- --server-bind-address=:{{ .Values.serverPort }}
- --qps={{ .Values.qps }}
- --burst={{ .Values.burst }}
- --service-domain-name={{ .Values.serviceDomainName }}
- --di-server-url=http://{{ .Values.serverName }}.{{ .Release.Namespace }}.{{ .Values.serviceDomainName }}:{{ .Values.serverPort }}
command:
- /di-orchestrator
- server
Expand Down
2 changes: 2 additions & 0 deletions chart/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ spec:
apiVersion: v1
kind: Service
metadata:
labels:
control-plane: {{ .Values.serverName }}
name: {{ .Values.serverName }}
namespace: {{ .Release.Namespace }}
spec:
Expand Down
3 changes: 2 additions & 1 deletion chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
# Declare variables to be passed into your templates.

# tag for di-orchestrator image
tag: ${ImageTag}
tag: v1.1.0

# registry for di-orchestrator image
registry: opendilab
serviceDomainName: svc.cluster.local

Expand Down
16 changes: 14 additions & 2 deletions cmd/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,27 @@ import (
)

type GenericFlags struct {
ZapOpts *zap.Options
QPS float64
Burst int
ServiceDomainName string
DIServerURL string
ZapOpts *zap.Options
}

func NewGenericFlags() *GenericFlags {
return &GenericFlags{
ZapOpts: &zap.Options{},
QPS: 5,
Burst: 10,
ServiceDomainName: "svc.cluster.local",
DIServerURL: "http://di-server.di-system.svc.cluster.local:8081",
ZapOpts: &zap.Options{},
}
}

// AddFlags registers the generic flags: qps and burst for the k8s client,
// the k8s service domain name, and the url for accessing di server. Each flag
// defaults to the value currently held in f. The zap logging options are
// bound as well.
// Registration goes through goflag's package-level functions and
// goflag.CommandLine; cmd is not referenced in this body.
// NOTE(review): goflag is presumably an alias for the standard library flag
// package; the import is not visible here - confirm.
func (f *GenericFlags) AddFlags(cmd *cobra.Command) {
goflag.Float64Var(&f.QPS, "qps", f.QPS, "qps for k8s client")
goflag.IntVar(&f.Burst, "burst", f.Burst, "burst for k8s client")
goflag.StringVar(&f.ServiceDomainName, "service-domain-name", f.ServiceDomainName, "k8s service domain name")
goflag.StringVar(&f.DIServerURL, "di-server-url", f.DIServerURL, "url for accessing di server")
// Bind the zap logger options onto goflag.CommandLine.
f.ZapOpts.BindFlags(goflag.CommandLine)
}
14 changes: 9 additions & 5 deletions cmd/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ limitations under the License.
package operator

import (
"context"
"flag"
"time"

Expand All @@ -32,6 +31,7 @@ import (
alloc "opendilab.org/di-orchestrator/pkg/allocator"
alloctypes "opendilab.org/di-orchestrator/pkg/allocator/types"
div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
dicommon "opendilab.org/di-orchestrator/pkg/common"
dicontext "opendilab.org/di-orchestrator/pkg/context"
"opendilab.org/di-orchestrator/pkg/controllers"
)
Expand Down Expand Up @@ -105,7 +105,13 @@ func runCommand(cmd *cobra.Command, options *CreateOptions) error {
logger := zap.New(zap.UseFlagOptions(options.GenericFlags.ZapOpts))
ctrl.SetLogger(logger)

// set common config
dicommon.SetServiceDomainName(options.ServiceDomainName)
dicommon.SetDIServerURL(options.DIServerURL)

config := ctrl.GetConfigOrDie()
config.QPS = float32(options.QPS)
config.Burst = options.Burst
mgr, err := ctrl.NewManager(config, ctrl.Options{
Scheme: scheme,
SyncPeriod: options.SyncPeriod,
Expand All @@ -119,8 +125,7 @@ func runCommand(cmd *cobra.Command, options *CreateOptions) error {
return err
}

ctx := dicontext.NewContext(context.Background(),
config,
ctx := dicontext.NewContext(config,
mgr.GetClient(),
mgr.GetEventRecorderFor("di-operator"),
ctrl.Log.WithName("di-operator"))
Expand All @@ -130,8 +135,7 @@ func runCommand(cmd *cobra.Command, options *CreateOptions) error {
return err
}

ctx = dicontext.NewContext(context.Background(),
config,
ctx = dicontext.NewContext(config,
mgr.GetClient(),
mgr.GetEventRecorderFor("di-allocator"),
ctrl.Log.WithName("di-allocator"))
Expand Down
18 changes: 12 additions & 6 deletions cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ limitations under the License.
package server

import (
"context"
"flag"

"github.com/spf13/cobra"
Expand All @@ -29,6 +28,7 @@ import (

cmdcommon "opendilab.org/di-orchestrator/cmd/common"
div2alpha1 "opendilab.org/di-orchestrator/pkg/api/v2alpha1"
dicommon "opendilab.org/di-orchestrator/pkg/common"
dicontext "opendilab.org/di-orchestrator/pkg/context"
"opendilab.org/di-orchestrator/pkg/server"
)
Expand Down Expand Up @@ -96,7 +96,13 @@ func runCommand(cmd *cobra.Command, options *CreateOptions) error {
logger := zap.New(zap.UseFlagOptions(options.GenericFlags.ZapOpts))
ctrl.SetLogger(logger)

// set common config
dicommon.SetServiceDomainName(options.ServiceDomainName)
dicommon.SetDIServerURL(options.DIServerURL)

config := ctrl.GetConfigOrDie()
config.QPS = float32(options.QPS)
config.Burst = options.Burst
mgr, err := ctrl.NewManager(config, ctrl.Options{
Scheme: scheme,
MetricsBindAddress: options.MetricAddress,
Expand All @@ -107,12 +113,12 @@ func runCommand(cmd *cobra.Command, options *CreateOptions) error {
return err
}

ctx := dicontext.NewContext(context.Background(),
config,
ctx := dicontext.NewContext(config,
mgr.GetClient(),
mgr.GetEventRecorderFor("di-operator"),
ctrl.Log.WithName("di-operator"))
diServer := server.NewDIServer(ctx, options.ServerBindAddress)
mgr.GetEventRecorderFor("di-server"),
ctrl.Log.WithName("di-server"))
processor := server.NewProcessor(ctx)
diServer := server.NewDIServer(ctx, processor, options.ServerBindAddress)
mgr.Add(diServer)

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
Expand Down
Loading

0 comments on commit 02c5aae

Please sign in to comment.