Feature/gpu support extended #87

Open
wants to merge 76 commits into base: develop from feature/GPU_support_extended

76 commits (the diff below shows changes from 24 of them)
5b9a16a
adding GPU support, adding test cases, adding NVIDIA runtime support
ksatzke Jul 10, 2020
5628d51
removing local configuration
ksatzke Jul 10, 2020
2d8a7ff
adding description on how to prepare and add a GPU node, and correspo…
ksatzke Jul 23, 2020
9fab0e3
fixing typos
ksatzke Jul 23, 2020
01c5c3d
corrections to description on adding GPU nodes
ksatzke Jul 29, 2020
977ee01
add GPU sandbox type to Makefiles and helm charts
ksatzke Aug 11, 2020
87fd792
add GPU sandbox type to Makefiles and helm charts
ksatzke Aug 11, 2020
32dd904
adding logic to spin up a GPU sandbox on demand
ksatzke Aug 12, 2020
cdd0faf
fixing typos in README
ksatzke Aug 20, 2020
9fba583
configure separate GPU support for management and common workflow ksvc's
Aug 21, 2020
0124123
added configuration of sandbox_gpu container image for wf pods to run …
Sep 29, 2020
cf047f0
improved configuration for workflows calling for GPUs
ksatzke Sep 29, 2020
89e8d24
first cut on extending Workflow class with GPU properties
ksatzke Oct 2, 2020
dfc7cd7
fixing bug on addWorkflow
ksatzke Oct 2, 2020
4508ea6
adding support for dynamic config of helm deployments on GPU to Manag…
ksatzke Oct 5, 2020
e251bf2
removing bug on java function executions
ksatzke Oct 6, 2020
5508abc
fixing a bug on asl_Map state tests with helm deployment
ksatzke Oct 6, 2020
a9ad920
adding first cut on gpu node selection logic for ansible multi-host d…
ksatzke Oct 7, 2020
ea3b5f6
merge develop branch
ksatzke Oct 8, 2020
ef7ed75
fixing bugs on SDK and GPU test configurations
ksatzke Oct 8, 2020
f7571f2
adding logic to configure gpu hosts, fixing bug on deployWorkflow on …
ksatzke Oct 9, 2020
a250fb3
cleanup tests and values.yaml
ksatzke Oct 12, 2020
f86c970
adjustments to available_hosts script and cleanup
ksatzke Oct 12, 2020
e334db1
final adjustments to values.yaml
ksatzke Oct 12, 2020
297a8e0
addressing comments from PR review, first part
ksatzke Oct 13, 2020
677cacd
adding ansible inventory group for GPU host configuration
ksatzke Oct 20, 2020
6941e7e
Merge branch 'feature/GPU_support_extended' of https://github.com/kni…
ksatzke Oct 20, 2020
6c3efd5
fixing errors in GPU deployment description
Nov 5, 2020
eaae6be
fixing errors in GPU deployment description
Nov 5, 2020
f707010
fixing errors in GPU deployment description
Nov 5, 2020
46577ad
1st cut on API modifications to allow configuration of mfn GPU requir…
Nov 11, 2020
dd3a1c3
adding GUI support for indicating assigned GPU cores in function table
abeckn Nov 11, 2020
4d734fa
adding GUI support for indicating assigned GPU cores in function tabl…
abeckn Nov 11, 2020
95d81ca
merging develop branch, first cut on integrating GUI support, partly …
Nov 16, 2020
232b3c0
fixing bug in deployWorkflow choosing the wrong sandbox image
Nov 16, 2020
d494c86
fixing bugs on GPU configuration in Management functions
Nov 16, 2020
a6f84a8
debug GPU parameter modification via GUI
Nov 17, 2020
9055a30
cleanup, tests are passing on gpu machine
ksatzke Nov 18, 2020
ecc1b8f
first cut on logic to deduce sandbox GPU requirements from function d…
ksatzke Nov 18, 2020
3fbcfb1
cleaning asl_Tensorflow_HelloWorld test
ksatzke Nov 18, 2020
99e04b0
adapted values.yaml to lab testbed vagrant conf
ksatzke Nov 30, 2020
1012b31
Merge branch 'develop' into feature/GPU_support_extended
ksatzke Nov 30, 2020
5db4cfb
updated values for kubespray setup
ksatzke Dec 1, 2020
50dc35f
remove hardcoding of imageRepo, adjusting values.yaml for GPU testbed
ksatzke Dec 1, 2020
ff25274
remove blocking of concurrent gpu pods in k8s deployment caused by 'l…
ksatzke Dec 10, 2020
d8c01e9
resolving conflicts with develop branch
ksatzke Dec 10, 2020
5163ede
adding support for configurable GPU core+memory sharing based on gpu-…
ksatzke Jan 12, 2021
47bd2cd
Merge branch 'develop' into feature/GPU_support_extended
ksatzke Jan 12, 2021
defacaf
fixing bug on GPU parameter calculations
ksatzke Jan 15, 2021
730e0f7
WIP: adding logic for node GPU capacity queries to ManagementService
ksatzke Jan 26, 2021
cd779af
use vgpu parameters in kservice setup
ksatzke Jan 27, 2021
09f58e8
adding capability to handle secret token for k8s core API
ksatzke Feb 2, 2021
7311aec
adding GUI and ManagementService changes for GPU parameters
ksatzke Feb 2, 2021
8520d5f
fixing bugs on GPU memory parameter calculations
ksatzke Feb 3, 2021
338fcd3
fixing more bugs on GPU memory parameter calculations
ksatzke Feb 8, 2021
5697446
fixing bugs in deployment script, adjusting values
ksatzke Feb 16, 2021
051b327
fixing bugs in available_hosts scripts
ksatzke Feb 26, 2021
70ce3ec
resolving bugs in host selection logic for deployment
ksatzke Mar 2, 2021
e0e4a86
fixing a bug in workflow GPU resource calculation
ksatzke Mar 25, 2021
d92a604
extending mfn SDK to handle GPU parameters
ksatzke Apr 1, 2021
1de8f8b
Merge branch 'feature/GPU_support_extended' of https://github.com/kni…
ksatzke Apr 1, 2021
48f7546
fixing bugs on ASL tests using GPUs
ksatzke Apr 13, 2021
463cbab
merge develop; update Dockerfile_gpu
iakkus Apr 19, 2021
7a1b157
fix to helm template management.yaml
iakkus Apr 26, 2021
191e0da
Revert "fix to helm template management.yaml"
iakkus Apr 26, 2021
4528bc6
fix to helm template management.yaml after merging with develop
iakkus Apr 26, 2021
ea5e1bd
Merge branch 'develop' into feature/GPU_support_extended
iakkus May 5, 2021
6913fbb
make Dockerfile installation instructions follow the same order
iakkus May 7, 2021
3647851
ansible: fix available hosts script
May 7, 2021
3187c49
management: fix deployWorkflow for bare metal with gpu hosts
iakkus May 7, 2021
700e298
update ansible readme; fixes #117
iakkus May 14, 2021
c46055b
Revert "update ansible readme; fixes #117"
iakkus May 14, 2021
f30f610
function worker: fix addressable function stopping when blocking for …
May 17, 2021
529be65
adding Dockerfile for opencv package
ksatzke Jun 9, 2021
05e6b25
fixing Map state, all tests are running
ksatzke Jun 11, 2021
a198f2b
merging recent develop into GPU feature branch
ksatzke Jan 10, 2022
2 changes: 1 addition & 1 deletion ManagementService/management_init.py
@@ -355,7 +355,7 @@ def printUsage():
sys.path.append(workflowdir)
if os.getenv("KUBERNETES_PORT", None) != None:
import deployWorkflow
url, endpoint_key = deployWorkflow.create_k8s_deployment(email, workflow_info, "Python", management=True)
url, endpoint_key = deployWorkflow.create_k8s_deployment(email, workflow_info, "Python", 0, management=True)
DLCLIENT_MANAGEMENT.putMapEntry("Management_workflow_endpoint_map", endpoint_key, url)
# Kubernetes mode only has one url
endpoint_list = [url]
6 changes: 6 additions & 0 deletions ManagementService/python/addWorkflow.py
@@ -27,6 +27,7 @@ def handle(value, sapi):
success = False

email = data["email"]


if "workflow" in data:
workflow = data["workflow"]
@@ -38,9 +39,14 @@
wf["status"] = "undeployed"
wf["modified"] = time.time()
wf["endpoints"] = []
#wf["gpu_usage"] = None
if "gpu_usage" in workflow:
wf["gpu_usage"] = str(workflow["gpu_usage"])

wf["id"] = hashlib.md5(str(uuid.uuid4()).encode()).hexdigest()

#wf["on_gpu"] = True # add metadata on GPU requirements for this workflow. ToDo: make this configurable via GUI

sapi.put(email + "_workflow_" + wf["id"], json.dumps(wf), True, True)
#sapi.put(email + "_workflow_json_" + wf["id"], "", True, True)
#sapi.put(email + "_workflow_requirements_" + wf["id"], "", True, True)
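For context, a minimal sketch of the request payload this change consumes. The gpu_usage field name comes from the diff; the surrounding payload shape and the values are assumed for illustration:

# Hypothetical addWorkflow input; "gpu_usage" is the new optional field.
data = {
    "email": "user@example.com",
    "workflow": {
        "name": "gpu-demo",
        "gpu_usage": 0.5   # fraction of a GPU requested for this workflow
    }
}
# handle() then stores it on the workflow metadata record:
#   wf["gpu_usage"] = str(workflow["gpu_usage"])   # -> "0.5"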
101 changes: 94 additions & 7 deletions ManagementService/python/deployWorkflow.py
@@ -26,6 +26,23 @@
WF_TYPE_SAND = 0
WF_TYPE_ASL = 1

def get_kv_pairs(testdict, keys, dicts=None):
# find and return kv pairs with particular keys in testdict
if not dicts:
dicts = [testdict]
testdict = [testdict]
data = testdict.pop(0)
if isinstance(data, dict):
data = data.values()
for d in data:
if isinstance(d, dict) or isinstance(d, list): # check d for type
testdict.append(d)
if isinstance(d, dict):
dicts.append(d)
if testdict: # still more data to search
return get_kv_pairs(testdict, keys, dicts)
return [(k, v) for d in dicts for k, v in d.items() if k in keys]
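# Illustrative only, not part of the diff: how get_kv_pairs walks a nested
# dict, using a hypothetical ASL-style document as input:
#
#   doc = {"StartAt": "A",
#          "States": {"A": {"Resource": "f1", "Next": "B"},
#                     "B": {"Resource": "f2", "End": True}}}
#   get_kv_pairs(doc, ["Resource"])
#   # -> [("Resource", "f1"), ("Resource", "f2")]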

def is_asl_workflow(wfobj):
return 'StartAt' in wfobj and 'States' in wfobj and isinstance(wfobj['States'], dict)

@@ -202,7 +219,8 @@ def start_docker_sandbox(host_to_deploy, uid, sid, wid, wname, sandbox_image_nam
try:
print("Starting sandbox docker container for: " + uid + " " + sid + " " + wid + " " + sandbox_image_name)
print("Docker daemon: " + "tcp://" + host_to_deploy[1] + ":2375" + ", environment variables: " + str(env_vars))
client.containers.run(sandbox_image_name, init=True, detach=True, ports={"8080/tcp": None}, ulimits=ulimit_list, auto_remove=True, name=sid, environment=env_vars, extra_hosts={host_to_deploy[0]:host_to_deploy[1]}, log_config=lc)
client.containers.run(sandbox_image_name, init=True, detach=True, ports={"8080/tcp": None}, ulimits=ulimit_list, auto_remove=True, name=sid, environment=env_vars, extra_hosts={host_to_deploy[0]:host_to_deploy[1]}, log_config=lc, runtime="nvidia")
#client.containers.run(sandbox_image_name, init=True, detach=True, ports={"8080/tcp": None}, ulimits=ulimit_list, auto_remove=True, name=sid, environment=env_vars, extra_hosts={host_to_deploy[0]:host_to_deploy[1]}, log_config=lc)
# TEST/DEVELOPMENT: no auto_remove to access sandbox logs
#client.containers.run(sandbox_image_name, init=True, detach=True, ports={"8080/tcp": None}, ulimits=ulimit_list, name=sid, environment=env_vars, extra_hosts={host_to_deploy[0]:host_to_deploy[1]}, log_config=lc)
except Exception as exc:
@@ -241,7 +259,7 @@ def get_workflow_host_port(host_to_deploy, sid):

return success, host_port

def create_k8s_deployment(email, workflow_info, runtime, management=False):
def create_k8s_deployment(email, workflow_info, runtime, gpu_usage, management=False):
# KUBERNETES MODE
new_workflow_conf = {}
conf_file = '/opt/mfn/SandboxAgent/conf/new_workflow.conf'
@@ -258,7 +276,11 @@
raise Exception("Unable to load "+ksvc_file+". Ensure that the configmap has been setup properly", e)

# Kubernetes labels cannot contain @ or _ and should start and end with alphanumeric characters
wfNameSanitized = 'wf-' + workflow_info["workflowId"].replace('@', '-').replace('_', '-').lower() + '-wf'
wfNameSanitized = 'wf-' + workflow_info["workflowId"].replace('@', '-').replace('_', '-').replace('/','-').lower() + '-wf'
#wfActualNameSanitized = 'wf-' + workflow_info["workflowName"].replace('@', '-').replace('_', '-').replace('/','-').lower() + '-wf'
if len(wfNameSanitized) > 63:
print("Error creating kubernetes deployment for "+email+" "+workflow_info["workflowId"] + ", workflow name too long")

emailSanitized = 'u-' + email.replace('@', '-').replace('_', '-').lower() + '-u'
# Pod, Deployment and Hpa names for the new workflow will have a prefix containing the workflow name and user name
app_fullname_prefix = ''
@@ -291,11 +313,40 @@
env.append({'name': 'WORKFLOWID', 'value': workflow_info["workflowId"]})
env.append({'name': 'WORKFLOWNAME', 'value': workflow_info["workflowName"]})

# Special handling for the management container
# apply gpu_usage fraction to k8s deployment configuration
print("GPU sage in create_k8s_service: "+ str(gpu_usage))
use_gpus = gpu_usage

if runtime=="Java": # non gpu python function
# overwrite values from values.yaml for new workflows
#kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = str(use_gpus)
#kservice['spec']['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = str(use_gpus)
kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox_java"

if not management and use_gpus == 0. and runtime=="Python": # non gpu python function
# overwrite values from values.yaml for new workflows
kservice['spec']['template']['spec']['containers'][0]['resources']['limits'].pop('nvidia.com/gpu', None) # ['nvidia.com/gpu'] = str(use_gpus)
kservice['spec']['template']['spec']['containers'][0]['resources']['requests'].pop('nvidia.com/gpu', None) # ['nvidia.com/gpu'] = str(use_gpus)
kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox"

if not management and use_gpus > 0. and runtime=="Python": # gpu using python function
# overwrite values from values.yaml for new workflows
kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = str(use_gpus)
kservice['spec']['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = str(use_gpus)
kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox_gpu"

# Special handling for the management container: never run on gpu
if management:
kservice['spec']['template']['spec']['volumes'] = [{ 'name': 'new-workflow-conf', 'configMap': {'name': new_workflow_conf['configmap']}}]
kservice['spec']['template']['spec']['containers'][0]['volumeMounts'] = [{'name': 'new-workflow-conf', 'mountPath': '/opt/mfn/SandboxAgent/conf'}]
kservice['spec']['template']['spec']['serviceAccountName'] = new_workflow_conf['mgmtserviceaccount']

# management container should not consume a GPU and uses the standard sandbox image
if (labels['workflowid'] == "Management"):
kservice['spec']['template']['spec']['containers'][0]['resources']['limits']['nvidia.com/gpu'] = "0"
kservice['spec']['template']['spec']['containers'][0]['resources']['requests']['nvidia.com/gpu'] = "0"
kservice['spec']['template']['spec']['containers'][0]['image'] = "localhost:5000/microfn/sandbox"

if 'HTTP_GATEWAYPORT' in new_workflow_conf:
env.append({'name': 'HTTP_GATEWAYPORT', 'value': new_workflow_conf['HTTP_GATEWAYPORT']})
if 'HTTPS_GATEWAYPORT' in new_workflow_conf:
@@ -325,6 +376,7 @@ def create_k8s_deployment(email, workflow_info, runtime, management=False):
print("ERROR deleting existing kservice")
print(resp.text)

# no change for Java function
print('Creating new kservice')
resp = requests.post(
"https://kubernetes.default:"+os.getenv("KUBERNETES_SERVICE_PORT_HTTPS")+"/apis/serving.knative.dev/v1/namespaces/"+namespace+"/services",
@@ -385,6 +437,8 @@ def handle(value, sapi):
raise Exception("malformed input")
sapi.log(json.dumps(workflow))
wfmeta = sapi.get(email + "_workflow_" + workflow["id"], True)
print("WFMETA in deployWorkflow: "+ str(wfmeta))

if wfmeta is None or wfmeta == "":
raise Exception("workflow metadata is not valid.")
try:
@@ -413,6 +467,8 @@
if is_asl_workflow(wfobj):
wf_type = WF_TYPE_ASL

#use_gpus = int(wfmeta._gpu_usage)

success, errmsg, resource_names, uploaded_resources = check_workflow_functions(wf_type, wfobj, email, sapi)
if not success:
raise Exception("Couldn't deploy workflow; " + errmsg)
@@ -445,6 +501,14 @@
#dlc.put("deployment_info_workflow_" + workflow["id"], json.dumps(deployment_info))
# _XXX_: important!
# put must not be queued as the function currently waits for the container to become ready

if "gpu_usage" in wfmeta and wfmeta["gpu_usage"] != "None":
gpu_usage = float(wfmeta["gpu_usage"])
else:
gpu_usage = 0.

print("deduced gpu_usage: " + str(gpu_usage))

sapi.put("deployment_info_workflow_" + workflow["id"], json.dumps(deployment_info), True, False)

status = "deploying"
@@ -454,7 +518,8 @@
runtime = "Java"
else:
runtime = "Python"
url, endpoint_key = create_k8s_deployment(email, workflow_info, runtime)

url, endpoint_key = create_k8s_deployment(email, workflow_info, runtime, gpu_usage)
if url is not None and len(url) > 0:
status = "deploying"
sapi.addSetEntry(workflow_info["workflowId"] + "_workflow_endpoints", str(url), is_private=True)
@@ -467,7 +532,12 @@
else:
# We're running BARE METAL mode
# _XXX_: due to the queue service still being in java in the sandbox
sandbox_image_name = "microfn/sandbox"

if gpu_usage == 0:
sandbox_image_name = "microfn/sandbox" # default value
elif gpu_usage != 0 and runtime == "Python":
sandbox_image_name = "microfn/sandbox_gpu" # sandbox uses GPU

if any(resource_info_map[res_name]["runtime"] == "Java" for res_name in resource_info_map):
sandbox_image_name = "microfn/sandbox_java"

@@ -477,8 +547,25 @@
if hosts is not None and hosts != "":
hosts = json.loads(hosts)
deployed_hosts = {}
# instruct hosts to start the sandbox and deploy workflow
gpu_hosts = {}
picked_hosts = {}

for hostname in hosts:
#if hostname.endswith("_gpu"):
if "has_gpu" in hosts[hostname]:
hostip = hosts[hostname]
gpu_hosts[hostname] = hostip

# instruct hosts to start the sandbox and deploy workflow
if runtime=="Java" or sandbox_image_name == "microfn/sandbox": # can use any host
Review comment (Member): I thought we had the "microfn/sandbox_java_gpu" image?

picked_hosts = hosts
elif len(gpu_hosts) > 0:
picked_hosts = gpu_hosts
else:
picked_hosts = hosts # fallback as there are no gpu hosts available
print("available GPU hosts is empty. Deploying on general purpose host")

for hostname in picked_hosts: # loop over the picked hosts; gpu hosts were picked for python/gpu workflows
hostip = hosts[hostname]
host_to_deploy = (hostname, hostip)
success, endpoint_key = start_docker_sandbox(host_to_deploy, email, workflow_info["sandboxId"], workflow_info["workflowId"], workflow_info["workflowName"], sandbox_image_name)
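For reference, the bare-metal host-selection rules from the loop above, condensed into a standalone function. This is a sketch only: the real code inlines the logic in handle(), and the shape of the host inventory entries is assumed:

def pick_hosts(hosts, runtime, sandbox_image_name):
    # Hosts whose inventory entry carries a "has_gpu" marker.
    gpu_hosts = {name: val for name, val in hosts.items() if "has_gpu" in val}
    if runtime == "Java" or sandbox_image_name == "microfn/sandbox":
        return hosts       # non-GPU sandboxes can run on any host
    if gpu_hosts:
        return gpu_hosts   # GPU sandboxes prefer GPU hosts
    return hosts           # fallback: no GPU hosts available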
75 changes: 75 additions & 0 deletions Sandbox/Dockerfile_gpu
@@ -0,0 +1,75 @@
# Copyright 2020 The KNIX Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#FROM ubuntu:18.04
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04

# Install (as root)
# Base
RUN apt-get update --fix-missing
RUN apt-get -y --no-install-recommends install build-essential
RUN apt-get -y --no-install-recommends install netbase unzip file libmagic1

# CUDA 10.1 dependencies and tools to build dlib
RUN apt-get -y --no-install-recommends install libsm6 libxrender1 libxrender-dev libxext6 libglib2.0-0 git cmake
RUN apt-get install -y --no-install-recommends libnvinfer6=6.0.1-1+cuda10.1 libnvinfer-dev=6.0.1-1+cuda10.1 libnvinfer-plugin6=6.0.1-1+cuda10.1

# Python
RUN apt-get -y --no-install-recommends install python3 python3-dev
RUN apt-get -y --no-install-recommends install python3-pip
RUN apt-get -y --no-install-recommends install zlib1g libssl1.0 libsasl2-2 ca-certificates

RUN /usr/bin/python3 -m pip install --upgrade pip

RUN /usr/bin/python3 -m pip install setuptools
RUN /usr/bin/python3 -m pip install "thrift>=0.12.0"
RUN /usr/bin/python3 -m pip install anytree
RUN /usr/bin/python3 -m pip install ujsonpath
RUN /usr/bin/python3 -m pip install requests
RUN /usr/bin/python3 -m pip install retry
# remove warnings from anytree package
RUN /usr/bin/python3 -m pip install fastcache
# Needed for multi-language support (currently just Java)
RUN /usr/bin/python3 -m pip install thriftpy2

# Install dlib for CUDA
RUN git clone https://github.com/davisking/dlib.git
RUN mkdir -p /dlib/build

RUN cmake -H/dlib -B/dlib/build -DDLIB_USE_CUDA=1 -DUSE_AVX_INSTRUCTIONS=1
RUN cmake --build /dlib/build

RUN cd /dlib; python3 /dlib/setup.py install

# Install the face recognition package and tensorflow
RUN pip3 install face_recognition
RUN pip3 install tensorflow==2.1.0
Review comment (Member): I am not sure why we need to install all these custom libraries for the GPU usage. If these are needed by the workflows, then they should specify it in the function requirements.


# Java (for queue service)
RUN apt-get -y --no-install-recommends install openjdk-8-jdk-headless

# Add components (as mfn)
RUN groupadd -o -g 1000 -r mfn && useradd -d /opt/mfn -u 1000 -m -r -g mfn mfn
RUN mkdir /opt/mfn/logs

COPY build/queueservice.jar /opt/mfn/
ADD frontend/frontend /opt/mfn/frontend
ADD build/SandboxAgent.tar.gz /opt/mfn/
ADD build/FunctionWorker.tar.gz /opt/mfn/
ADD build/LoggingService.tar.gz /opt/mfn/

RUN chown mfn:mfn -R /opt/mfn
USER mfn
WORKDIR /opt/mfn
CMD ["python3", "/opt/mfn/SandboxAgent/sandboxagent.py"]
66 changes: 66 additions & 0 deletions Sandbox/Dockerfile_java_gpu
@@ -0,0 +1,66 @@
# Copyright 2020 The KNIX Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#FROM ubuntu:18.04
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04

# Install (as root)
# Base
RUN apt-get update --fix-missing
RUN apt-get -y --no-install-recommends install build-essential
RUN apt-get -y --no-install-recommends install netbase unzip file libmagic1

# Python
RUN apt-get -y --no-install-recommends install python3 python3-dev
RUN apt-get -y --no-install-recommends install python3-pip
RUN apt-get -y --no-install-recommends install zlib1g libssl1.0 libsasl2-2 ca-certificates

RUN /usr/bin/python3 -m pip install --upgrade pip

RUN /usr/bin/python3 -m pip install setuptools
RUN /usr/bin/python3 -m pip install "thrift>=0.12.0"
RUN /usr/bin/python3 -m pip install anytree
RUN /usr/bin/python3 -m pip install ujsonpath
RUN /usr/bin/python3 -m pip install requests
RUN /usr/bin/python3 -m pip install retry
# remove warnings from anytree package
RUN /usr/bin/python3 -m pip install fastcache
# Needed for multi-language support (currently just Java)
RUN /usr/bin/python3 -m pip install thriftpy2

# Java
RUN apt-get -y --no-install-recommends install openjdk-8-jdk-headless

RUN apt-get -y --no-install-recommends install maven

# Add components (as mfn)
RUN groupadd -o -g 1000 -r mfn && useradd -d /opt/mfn -u 1000 -m -r -g mfn mfn
RUN mkdir /opt/mfn/logs

COPY build/queueservice.jar /opt/mfn/
ADD frontend/frontend /opt/mfn/frontend
ADD build/SandboxAgent.tar.gz /opt/mfn/
ADD build/FunctionWorker.tar.gz /opt/mfn/
ADD build/LoggingService.tar.gz /opt/mfn/

ADD build/JavaRequestHandler.tar.gz /opt/mfn/

RUN chmod +x /opt/mfn/JavaRequestHandler/setup_maven.sh
RUN /opt/mfn/JavaRequestHandler/setup_maven.sh True
RUN mvn -Duser.home=/tmp -DskipTests -gs /opt/mfn/JavaRequestHandler/maven/sandbox-mvn-settings.xml -f /opt/mfn/JavaRequestHandler/maven/init-mvn.pom.xml dependency:resolve-plugins

RUN chown mfn:mfn -R /opt/mfn
USER mfn
WORKDIR /opt/mfn
CMD ["python3", "/opt/mfn/SandboxAgent/sandboxagent.py"]
12 changes: 12 additions & 0 deletions Sandbox/Makefile
@@ -20,6 +20,7 @@ include ../build_env.mk

default: build_thrift \
image \
image_gpu \
image_java

clean:
@@ -95,6 +96,16 @@ image: \
build/SandboxAgent.tar.gz
$(call build_image,Dockerfile,microfn/sandbox)

image_gpu: \
Dockerfile_gpu \
build/queueservice.jar \
frontend/frontend \
build/LoggingService.tar.gz \
build/FunctionWorker.tar.gz \
build/SandboxAgent.tar.gz
$(call build_image,Dockerfile_gpu,microfn/sandbox_gpu)


image_java: \
Dockerfile_java \
build/queueservice.jar \
@@ -107,6 +118,7 @@ image_java: \

push: image image_java
$(call push_image,microfn/sandbox)
$(call push_image,microfn/sandbox_gpu)
Review comment (Member): microfn/sandbox_java_gpu? Need to also update the dependencies for the Makefile target.

$(call push_image,microfn/sandbox_java)

