This repository has been archived by the owner on May 4, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathgcloud.sh
151 lines (122 loc) · 5.83 KB
/
gcloud.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/bin/bash
# Startup banner, printed on every invocation: points at the mlbench tutorial
# and lists the gcloud settings the user is expected to have configured
# before running any sub-command of this script.
printf '%s\n' \
  '==============================================================================' \
  'https://mlbench.github.io/2018/09/04/tutorial/' \
  '' \
  'Assume the project, zone, region has been set' \
  ' gcloud config set project [Project Name]' \
  ' gcloud config set container/cluster [Cluster Name]' \
  ' gcloud config set compute/zone [Zone]' \
  ' gcloud config set compute/region [region]' \
  ' gcloud container clusters get-credentials [cluster name] --zone [Zone] --project [project name]' \
  '' \
  '=============================================================================='
# ------------------------------------------------------------------------------
# Configuration.
#
# Every setting below keeps the value found in the environment and only falls
# back to the default shown here when it is unset or empty, e.g.
#   NUM_NODES=4 MACHINE_ZONE=us-central1-a ./gcloud.sh create
# ------------------------------------------------------------------------------

# Name of the GKE cluster; also used as the name of the firewall rule.
CLUSTER_NAME=${CLUSTER_NAME:-benchmark-settings}
# Value passed to `--machine-type` when the cluster is created.
MACHINE_TYPE=${MACHINE_TYPE:-n1-standard-4}
# Value passed to `--num-nodes`; also used for the chart's limits.workers and
# worker.replicaCount values.
NUM_NODES=${NUM_NODES:-2}
# Accelerator count per node (only used by the `create-gpu` sub-command).
NUM_GPU_PER_NODE=${NUM_GPU_PER_NODE:-1}
# Helm release name; the master service is looked up as
# "${RELEASE_NAME}-mlbench-master".
RELEASE_NAME=${RELEASE_NAME:-rel}
# Extra flag(s) inserted verbatim into `gcloud container clusters create`
# (e.g. --preemptible). Empty by default, i.e. no extra flag.
PREEMPTIBLE=${PREEMPTIBLE:-}
# Values file handed to `helm upgrade -f`.
MYVALUES_FILE=${MYVALUES_FILE:-values.yaml}
# Value passed to `--cluster-version`.
CLUSTER_VERSION=${CLUSTER_VERSION:-1.10}
# Value passed to `--disk-size`.
INSTANCE_DISK_SIZE=${INSTANCE_DISK_SIZE:-50}
# Zone used for cluster creation, credential lookup and deletion.
MACHINE_ZONE=${MACHINE_ZONE:-europe-west1-b}
# Value passed to `--disk-type`.
DISK_TYPE=${DISK_TYPE:-pd-standard}
# Accelerator type (only used by the `create-gpu` sub-command).
GPU_TYPE=${GPU_TYPE:-nvidia-tesla-k80}
#######################################
# Sub-command dispatcher for managing an mlbench deployment on GKE.
#
# Globals read (set by the configuration section earlier in this script):
#   CLUSTER_NAME, MACHINE_TYPE, NUM_NODES, NUM_GPU_PER_NODE, RELEASE_NAME,
#   PREEMPTIBLE, MYVALUES_FILE, CLUSTER_VERSION, INSTANCE_DISK_SIZE,
#   MACHINE_ZONE, DISK_TYPE, GPU_TYPE
# Globals written and exported: NODE_PORT, NODE_IP (install, dashboard)
# External tools: gcloud, kubectl, helm
#######################################

# Print the list of supported sub-commands to stdout.
_usage() {
  printf '%s\n' \
    "Usage: ${0##*/} <command>" \
    "" \
    "Commands:" \
    "  create           create a CPU cluster and initialise helm/tiller" \
    "  create-gpu       create a GPU cluster, install NVIDIA drivers, initialise helm/tiller" \
    "  install          install/upgrade the mlbench chart and open its node port in the firewall" \
    "  upgrade          install/upgrade the mlbench chart only" \
    "  uninstall        delete the helm release and the firewall rule" \
    "  cleanup          delete the firewall rule and the cluster" \
    "  dashboard        print the dashboard URL of the master pod" \
    "  status           show the helm release status" \
    "  get-credentials  fetch kubectl credentials for the cluster" \
    "  list             print the current settings"
}

# Create the GKE cluster. Any arguments are inserted into the gcloud call just
# before --scopes (used by create-gpu to add --accelerator).
_create_cluster() {
  # PREEMPTIBLE is intentionally unquoted: when empty it must expand to no
  # argument at all, and it may carry more than one flag.
  # shellcheck disable=SC2086
  gcloud container clusters create "${CLUSTER_NAME}" \
    ${PREEMPTIBLE} \
    --zone "${MACHINE_ZONE}" \
    --cluster-version "${CLUSTER_VERSION}" \
    --enable-network-policy \
    --machine-type="${MACHINE_TYPE}" \
    --num-nodes="${NUM_NODES}" \
    --disk-type="${DISK_TYPE}" \
    --disk-size="${INSTANCE_DISK_SIZE}" \
    "$@" \
    --scopes=storage-full
}

# Point kubectl at the cluster.
_fetch_credentials() {
  gcloud container clusters get-credentials --zone "${MACHINE_ZONE}" "${CLUSTER_NAME}"
}

# Create the tiller service account, bind it to cluster-admin and run helm init.
# Failures of the first two steps are not fatal, as in the original flow.
_setup_helm() {
  kubectl --namespace kube-system create sa tiller
  kubectl create clusterrolebinding tiller --clusterrole cluster-admin --serviceaccount=kube-system:tiller
  helm init --wait --service-account tiller
}

# Install or upgrade the mlbench chart with NUM_NODES workers.
_deploy_release() {
  helm upgrade --wait --recreate-pods -f "${MYVALUES_FILE}" --timeout 900 --install "${RELEASE_NAME}" charts/mlbench \
    --set limits.workers="${NUM_NODES}" \
    --set worker.replicaCount="${NUM_NODES}"
}

# Set and export NODE_PORT (first port of the master service) and NODE_IP
# (external IP of the first node reported by kubectl).
_resolve_endpoint() {
  local internal_ip
  NODE_PORT=$(kubectl get --namespace default -o jsonpath="{.spec.ports[0].nodePort}" services "${RELEASE_NAME}-mlbench-master")
  internal_ip=$(kubectl get nodes --namespace default -o jsonpath="{.items[0].status.addresses[0].address}")
  # Locate the row whose field equals the internal IP exactly and print the
  # field right after it. A fixed column index is wrong as soon as the
  # PREEMPTIBLE column is populated (every later column shifts by one), and a
  # regex/substring match would also hit e.g. 10.0.0.20 when looking for
  # 10.0.0.2.
  # NOTE(review): assumes the listing prints the external IP directly after
  # the internal IP; an instance without an external IP yields the next field.
  NODE_IP=$(gcloud compute instances list \
    | awk -v ip="${internal_ip}" '
        !found {
          for (i = 1; i < NF; i++) {
            if ($i == ip) { print $(i + 1); found = 1; break }
          }
        }')
  export NODE_PORT NODE_IP
}

# Dispatch on the first argument.
# Returns: status of the last command run by the chosen sub-command;
#          0 for no argument/help; 2 for an unknown sub-command.
main() {
  local master_node external_ip name

  case "${1:-}" in
    create)
      # Stop if the cluster was not created or credentials were not fetched;
      # otherwise tiller would be installed into whatever cluster the current
      # kubectl context happens to point at.
      _create_cluster || return
      _fetch_credentials || return
      _setup_helm
      ;;
    create-gpu)
      _create_cluster --accelerator "type=${GPU_TYPE},count=${NUM_GPU_PER_NODE}" || return
      _fetch_credentials || return
      # Install NVIDIA driver for GPUs.
      kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/k8s-1.10/nvidia-driver-installer/cos/daemonset-preloaded.yaml
      _setup_helm
      ;;
    install)
      echo "Installing helm charts..."
      echo "MYVALUES_FILE=${MYVALUES_FILE}"
      _deploy_release
      _resolve_endpoint
      if [[ -z "${NODE_PORT}" ]]; then
        echo "Could not determine the node port of ${RELEASE_NAME}-mlbench-master; not creating a firewall rule." >&2
        return 1
      fi
      echo "http://${NODE_IP}:${NODE_PORT}"
      gcloud compute firewall-rules create --quiet "${CLUSTER_NAME}" --allow "tcp:${NODE_PORT}"
      ;;
    upgrade)
      _deploy_release
      ;;
    uninstall)
      helm delete --purge "${RELEASE_NAME}"
      gcloud compute firewall-rules delete --quiet "${CLUSTER_NAME}"
      ;;
    cleanup)
      # The firewall rule may not exist (install never ran); delete the
      # cluster regardless.
      gcloud compute firewall-rules delete --quiet "${CLUSTER_NAME}"
      gcloud container clusters delete --quiet --zone "${MACHINE_ZONE}" "${CLUSTER_NAME}"
      ;;
    dashboard)
      _resolve_endpoint
      # Field 7 of `kubectl get pods -o wide` is taken as the node name, as in
      # the original; only the first matching pod is used.
      master_node=$(kubectl get pods -o wide | awk '/mlbench-master/ && !seen++ { print $7 }')
      if [[ -z "${master_node}" ]]; then
        echo "No mlbench-master pod found." >&2
        return 1
      fi
      echo "Node of master pod is  ${master_node}"
      # Ask for the ExternalIP address by type instead of relying on a column
      # position in the human-readable table.
      external_ip=$(kubectl get nodes "${master_node}" -o jsonpath='{.status.addresses[?(@.type=="ExternalIP")].address}')
      echo "External IP of the node is  ${external_ip}"
      echo "Dashboard URL:  http://${external_ip}:${NODE_PORT}"
      ;;
    status)
      helm status "${RELEASE_NAME}"
      ;;
    get-credentials)
      _fetch_credentials
      ;;
    list)
      for name in CLUSTER_NAME MACHINE_TYPE NUM_NODES NUM_GPU_PER_NODE \
        RELEASE_NAME PREEMPTIBLE MYVALUES_FILE CLUSTER_VERSION \
        INSTANCE_DISK_SIZE MACHINE_ZONE DISK_TYPE GPU_TYPE; do
        printf '%s=%s\n' "${name}" "${!name}"
      done
      ;;
    '' | help | -h | --help)
      _usage
      ;;
    *)
      printf 'Unknown command: %s\n' "$1" >&2
      _usage >&2
      return 2
      ;;
  esac
}

main "$@"