Skip to content

Commit

Permalink
fix: Drop CloudCE proxy handling
Browse files Browse the repository at this point in the history
  • Loading branch information
sfayer committed Oct 10, 2024
1 parent fbfad32 commit 0e246f0
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 113 deletions.
121 changes: 15 additions & 106 deletions src/DIRAC/Resources/Computing/CloudComputingElement.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,20 @@
- File I/O: A small amount of input may be transferred through the
instance metadata, but after that the VM is inaccessible.
- Authentication: Most cloud endpoints use a password or API style credentials
rather than a grid style proxy based authentication. The pilot still requires
a suitable proxy, but this cannot be renewed via the cloud interface due to
the I/O limitations.
rather than a grid style proxy based authentication.
- Pilot (VM) Tidy-up: Cloud providers will not remove stopped instances by
default.
To avoid the proxy renewal limitations, an alternate pilot proxy is used within
the instances. This can either be a longer version of the usual pilot proxy or
a pilot proxy generated from another dedicated cert/user. The proxy contains
the DIRAC group, but no VOMS (as this would likely expire too quickly).
The cloud instances now use the standard pilot proxy bundled in the job wrapper
script. The extended lifetime proxy that was generated and included in the
cloud user_data is no longer required and has been removed.
By default it is assumed that a generic CentOS7 base image is being used. This
will be fully contextualised using cloud-init:
- CVMFS & Singularity will be installed.
- A dirac user will be created to run the jobs.
- Pilot proxy and start-up scripts will be installed in /mnt.
- Pilot start-up scripts will be installed in /mnt.
- The usual pilot script will be placed in the dirac home directory and
the start-up scripts are run (as the dirac user).
- After the pilot terminates, the machine is stopped by calling halt.
Expand Down Expand Up @@ -118,13 +115,6 @@
Note: It is highly recommended to use SingularityCE with a container
image with the required packages instead.
Context_ProxyLifetime:
(Optional) When submitting an instance, it will be provisioned with a new
proxy with the same properties as the one provided by the SiteDirector but
with an extended lifetime. This option sets the lifetime of the new proxy
in seconds: It must be greater than the maximum time jobs can run for in
the instance. Defaults to two weeks.
Context_MaxLifetime:
(Optional) The maximum lifetime of an instance in seconds. Any instances
older than this will be removed regardless of state. Defaults to two weeks.
Expand Down Expand Up @@ -160,10 +150,6 @@

from DIRAC import S_OK, S_ERROR, gConfig, rootPath
from DIRAC.Resources.Computing.ComputingElement import ComputingElement
from DIRAC.Core.Security.ProxyInfo import getProxyInfo
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getVOForGroup
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
from DIRAC.FrameworkSystem.Client.ProxyManagerClient import ProxyManagerClient
from DIRAC.FrameworkSystem.Client.ProxyManagerClient import gProxyManager

# Standard CE name
Expand All @@ -176,9 +162,6 @@
OPT_PROVIDER = "CloudType"
OPT_AUTHFILE = "CloudAuth"
DEF_AUTHFILE = os.path.join(rootPath, "etc/cloud.auth")
# default proxy lifetime (2 weeks)
DEF_PROXYLIFETIME = 1209600
DEF_PROXYGRACE = 86400
# default max instance lifetime in seconds
# all instances older than this will be removed
DEF_MAXLIFETIME = 1209600
Expand Down Expand Up @@ -217,13 +200,17 @@ def _getDriverAuth(self):
except KeyError:
raise RuntimeError(f"Invalid auth config for host {self.ceName}")
# If the secret is set to the magic string "PROXY"
# we instead return a path to a grid proxy file
# we instead return the proxy string
if secret == "PROXY":
if self._origProxy:
secret = self._origProxy
secret = ""
if self.proxy:
result = self.proxy.dumpAllToString()
if result["OK"]:
secret = result["Value"]
else:
self.log.warn(f"Unable to convert proxy for {self.ceName}:", result["Message"])
else:
self.log.warn(f"Proxy for {self.ceName} not set!")
secret = ""
return (key, secret)

def _getDriver(self, refresh=False):
Expand Down Expand Up @@ -354,9 +341,7 @@ def _getMetadata(self, executableFile, pilotStamp=""):
with open(template_file) as template_fd:
template = yaml.safe_load(template_fd)
for filedef in template["write_files"]:
if filedef["content"] == "PROXY_STR":
filedef["content"] = self.proxy
elif filedef["content"] == "EXECUTABLE_STR":
if filedef["content"] == "EXECUTABLE_STR":
filedef["content"] = exe_str
elif "STAMP_STR" in filedef["content"]:
filedef["content"] = filedef["content"].replace("STAMP_STR", pilotStamp)
Expand All @@ -375,37 +360,6 @@ def _getMetadata(self, executableFile, pilotStamp=""):
userData.attach(mimeText)
return str(userData)

def _renewCloudProxy(self):
    """Make sure the stored cloud proxy is valid for the configured lifetime.

    The lifetime is read from the ``Context_ProxyLifetime`` CE parameter
    (falling back to ``DEF_PROXYLIFETIME``). If the currently stored proxy
    is already valid for longer than that, nothing is downloaded. Otherwise
    a limited proxy for the cloud DN/group is requested from the proxy
    manager with the lifetime plus ``DEF_PROXYGRACE`` seconds, dumped to a
    string and stored in ``self.proxy``; ``self.valid`` is updated to match.

    :returns: True if the stored proxy is usable, False on any failure.
    :rtype: bool
    """
    # Both the DN and the group are needed to request a proxy.
    if not (self._cloudDN and self._cloudGroup):
        self.log.error("Could not renew cloud proxy, DN and/or Group not set.")
        return False

    wantedSeconds = int(self.ceParameters.get("Context_ProxyLifetime", DEF_PROXYLIFETIME))
    # self.valid is a (naive UTC) datetime; skip the download while the
    # remaining validity still exceeds the configured lifetime.
    timeLeft = self.valid - datetime.datetime.utcnow()
    if timeLeft > datetime.timedelta(seconds=wantedSeconds):
        return True

    # Request some extra margin on top of the configured lifetime.
    wantedSeconds += DEF_PROXYGRACE
    client = ProxyManagerClient()
    self.log.info(f"Downloading proxy with cloudDN and cloudGroup: {self._cloudDN}, {self._cloudGroup}")
    download = client.downloadProxy(
        self._cloudDN,
        self._cloudGroup,
        limited=True,
        requiredTimeLeft=wantedSeconds,
    )
    if not download["OK"]:
        self.log.error("Could not download proxy", download["Message"])
        return False

    dumped = download["Value"].dumpAllToString()
    if not dumped["OK"]:
        self.log.error("Failed to dump proxy to string", dumped["Message"])
        return False

    self.proxy = dumped["Value"]
    self.valid = datetime.datetime.utcnow() + datetime.timedelta(seconds=wantedSeconds)
    return True

def __init__(self, *args, **kwargs):
"""Constructor
Takes the standard CE parameters.
Expand All @@ -417,60 +371,15 @@ def __init__(self, *args, **kwargs):
# proxy expiry time (in date time)
self.valid = datetime.datetime.utcnow()
self._cloudDriver = None
self._cloudDN = None
self._cloudGroup = None
self._origProxy = None

def setProxy(self, proxy, valid=0):
    """Store the given proxy and obtain the long-lived cloud proxy from it.

    The proxy is first dumped to a file whose path is kept in
    ``self._origProxy``, and the cloud driver is refreshed. The group and
    identity are then read from the proxy; the DN used for the cloud proxy
    is the ``Pilot/GenericCloudDN`` Operations value for that group,
    defaulting to the proxy's own identity. Finally ``_renewCloudProxy()``
    is called to fetch a proxy with the lifetime set in the CE config.

    :param proxy: existing proxy to take the group and identity from
    :param valid: not used by this method
    :return: S_OK() or S_ERROR(error string)
    """
    # Store original proxy for FedCloud submission/auth
    # We write this to a file as that's the format we need
    ret = gProxyManager.dumpProxyToFile(proxy)
    if ret["OK"]:
        self._origProxy = ret["Value"]
    else:
        # Best effort: a failed result has no "Value" entry, so record that
        # no proxy file is available rather than raising KeyError here.
        self.log.error("Failed to write proxy file", f"for {self.ceName}: {ret['Message']}")
        self._origProxy = None
    # Force a driver refresh to reload the proxy
    self._getDriver(refresh=True)
    # we deliberately log extra errors here,
    # as the return value is not always checked
    res = getProxyInfo(proxy, disableVOMS=True)
    if not res["OK"]:
        self.log.error("getProxyInfo failed", res["Message"])
        return S_ERROR(f"getProxyInfo did not return OK: {str(res)}")
    info = res["Value"]
    if "group" not in info:
        self.log.error("No group found in proxy")
        return S_ERROR("No group found in proxy")
    if "identity" not in info:
        self.log.error("No user DN (identity) found in proxy")
        return S_ERROR("No user DN (identity) found in proxy")
    pilotGroup = info["group"]
    pilotDN = info["identity"]
    opsHelper = Operations(group=pilotGroup)
    # A dedicated cloud DN may be configured; otherwise reuse the pilot DN.
    self._cloudDN = opsHelper.getValue("Pilot/GenericCloudDN", pilotDN)
    self._cloudGroup = pilotGroup
    if not self._renewCloudProxy():
        self.log.error("Failed to renew proxy.")
        return S_ERROR("Failed to renew proxy.")
    return S_OK()

def submitJob(self, executableFile, proxy, numberOfJobs=1):
"""Creates VM instances
:param str executableFile: Path to pilot job wrapper file to use
:param str proxy: Unused, see setProxy()
:param str proxy: Unused, requires bundled proxy in executableFile instead
:param int numberOfJobs: Number of instances to start
:return: S_OK/S_ERROR
"""
if not self._renewCloudProxy():
return S_ERROR("Failed to renew proxy during job submission.")

instIDs = []

# these parameters are identical for each job
Expand Down
7 changes: 0 additions & 7 deletions src/DIRAC/Resources/Computing/cloudinit.template
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@ write_files:
CVMFS_CACHE_BASE=/mnt/cvmfs
CVMFS_HTTP_PROXY="auto;DIRECT"
CVMFS_PAC_URLS="http://grid-wpad/wpad.dat;http://wpad/wpad.dat;http://cernvm-wpad.cern.ch/wpad.dat;http://cernvm-wpad.fnal.gov/wpad.dat"
- path: /root/proxy.pem
permissions: '0600'
content: PROXY_STR
- path: /root/run_pilot.sh
permissions: '0755'
content: EXECUTABLE_STR
Expand All @@ -53,7 +50,6 @@ write_files:
content: |
#!/bin/bash
cd /mnt/dirac
export X509_USER_PROXY=/mnt/proxy.pem
export PILOT_UUID="cloud://$(cat /var/lib/cloud/data/instance-id)"
export DIRAC_PILOT_STAMP="STAMP_STR"
bash /mnt/run_pilot.sh &> /mnt/dirac/startup.log
Expand Down Expand Up @@ -84,9 +80,6 @@ runcmd:
- [ useradd, -m, -d, /mnt/dirac, dirac ]
- [ passwd, -l, dirac ]
- [ mkdir, -p, /mnt/dirac/etc/grid-security ]
- [ cp, /root/proxy.pem, /mnt/proxy.pem ]
- [ chmod, 600, /mnt/proxy.pem ]
- [ chown, "dirac:dirac", /mnt/proxy.pem ]
- [ chown, -R, "dirac:dirac", /mnt/dirac/etc ]
# Start caching certificates directory from CVMFS
# Workaround for when local Stratum-1 is slow
Expand Down

0 comments on commit 0e246f0

Please sign in to comment.