Skip to content

Commit

Permalink
Merge pull request #18870 from zexi/automated-cherry-pick-of-#18780-u…
Browse files Browse the repository at this point in the history
…pstream-master

Automated cherry pick of #18780: feat(host): use IsolatedDeviceModels as GPU whitelist
  • Loading branch information
zexi authored Dec 4, 2023
2 parents af4f1b3 + a270a06 commit c646b3c
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 36 deletions.
1 change: 1 addition & 0 deletions pkg/cloudcommon/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ type HostCommonOptions struct {

EnableRemoteExecutor bool `help:"Enable remote executor" default:"false"`

EnableIsolatedDeviceWhitelist bool `help:"enable isolated device white list" default:"false"`
ExecutorConnectTimeoutSeconds int `help:"executor client connection timeout in seconds, default is 30" default:"30"`
ImageDeployDriver string `help:"Image deploy driver" default:"qemu-kvm" choices:"qemu-kvm|nbd|libguestfs"`
}
Expand Down
4 changes: 3 additions & 1 deletion pkg/hostman/hostinfo/hostinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -2057,6 +2057,8 @@ func (h *SHostInfo) probeSyncIsolatedDevices() (*jsonutils.JSONArray, error) {
}
}

enableDevWhitelist := options.HostOptions.EnableIsolatedDeviceWhitelist

offloadNics, err := h.getNicsOvsOffloadInterfaces(options.HostOptions.OvsOffloadNics)
if err != nil {
return nil, err
Expand All @@ -2065,7 +2067,7 @@ func (h *SHostInfo) probeSyncIsolatedDevices() (*jsonutils.JSONArray, error) {
h.IsolatedDeviceMan.ProbePCIDevices(
options.HostOptions.DisableGPU, options.HostOptions.DisableUSB, options.HostOptions.DisableCustomDevice,
sriovNics, offloadNics, options.HostOptions.PTNVMEConfigs, options.HostOptions.AMDVgpuPFs, options.HostOptions.NVIDIAVgpuPFs,
)
enableDevWhitelist)

objs, err := h.getRemoteIsolatedDevices()
if err != nil {
Expand Down
36 changes: 32 additions & 4 deletions pkg/hostman/isolated_device/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,23 +66,41 @@ const (
DEFAULT_CPU_CMD = "host,kvm=off"
)

func getPassthroughGPUs(filteredAddrs []string) ([]*PCIDevice, error, []error) {
// isInWhitelistModels reports whether dev matches at least one entry of
// models. An entry matches only when both its VendorId and its DeviceId are
// equal to the corresponding fields of dev. An empty or nil models slice
// never matches.
func isInWhitelistModels(models []IsolatedDeviceModel, dev *PCIDevice) bool {
	for i := range models {
		// Vendor is compared first; the device ID is only looked at for
		// entries from the same vendor.
		if models[i].VendorId != dev.VendorId {
			continue
		}
		if models[i].DeviceId == dev.DeviceId {
			return true
		}
	}
	return false
}

func getPassthroughGPUs(filteredAddrs []string, enableWhitelist bool, whitelistModels []IsolatedDeviceModel) ([]*PCIDevice, error, []error) {
lines, err := getGPUPCIStr()
if err != nil {
return nil, err, nil
}

warns := make([]error, 0)
devs := []*PCIDevice{}
log.Infof("filter address %v", filteredAddrs)
log.Infof("filter address %v, enableWhiteList: %v", filteredAddrs, enableWhitelist)
for _, line := range lines {
if len(line) == 0 {
continue
}
dev := NewPCIDevice2(line)
if utils.IsInStringArray(dev.Addr, filteredAddrs) {
continue
}
if !utils.IsInArray(dev.ClassCode, GpuClassCodes) {
continue
}
if enableWhitelist {
if !isInWhitelistModels(whitelistModels, dev) {
log.Infof("skip add device %s cause of not in isolated_device_models", dev.String())
continue
}
}
if err := dev.checkSameIOMMUGroupDevice(); err != nil {
warns = append(warns, errors.Wrapf(err, "get dev %s iommu group devices", dev.Addr))
continue
Expand Down Expand Up @@ -143,6 +161,9 @@ type PCIDevice struct {
}

func NewPCIDevice(line string) (*PCIDevice, error) {
if len(line) == 0 {
return nil, errors.Errorf("input line is empty")
}
dev := NewPCIDevice2(line)
if err := dev.checkSameIOMMUGroupDevice(); err != nil {
return nil, err
Expand All @@ -156,7 +177,7 @@ func NewPCIDevice(line string) (*PCIDevice, error) {
// NewPCIDevice2 builds a PCIDevice from one line of text by handing it to
// parseLspci and then tries to populate the device's PCIe details through
// fillPCIEInfo.
//
// A fillPCIEInfo failure is not fatal: it is only logged as a warning and the
// parsed device is returned anyway. Unlike NewPCIDevice, this constructor does
// not call checkSameIOMMUGroupDevice and never returns an error.
func NewPCIDevice2(line string) *PCIDevice {
dev := parseLspci(line)
if err := dev.fillPCIEInfo(); err != nil {
// NOTE(review): this listing is a rendered diff; the first Warningf appears
// to be the pre-change line and the second its replacement, which also
// prints the raw input line quoted with %q — confirm against the real file.
log.Warningf("fillPCIEInfo for device: %s, error: %v", dev.String(), err)
log.Warningf("fillPCIEInfo for line: %q, device: %s, error: %v", line, dev.String(), err)
}
return dev
}
Expand Down Expand Up @@ -384,6 +405,9 @@ func (d *PCIDevice) bindDriver() error {
}

func (d *PCIDevice) fillPCIEInfo() error {
if d.Addr == "" {
return errors.Errorf("device address is empty: %s", d.String())
}
cmd := fmt.Sprintf("lspci -vvv -s %s", d.Addr)
lines, err := bashOutput(cmd)
if err != nil {
Expand Down Expand Up @@ -537,7 +561,11 @@ func detectPCIDevByAddrWithoutIOMMUGroup(addr string) (*PCIDevice, error) {
if err != nil {
return nil, err
}
return NewPCIDevice2(strings.Join(ret, "")), nil
line := strings.Join(ret, "")
if line == "" {
return nil, nil
}
return NewPCIDevice2(line), nil
}

func getDeviceCmd(dev IDevice, index int) string {
Expand Down
54 changes: 25 additions & 29 deletions pkg/hostman/isolated_device/isolated_device.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ type IsolatedDeviceManager interface {
GetDevices() []IDevice
GetDeviceByIdent(vendorDevId, addr, mdevId string) IDevice
GetDeviceByAddr(addr string) IDevice
ProbePCIDevices(skipGPUs, skipUSBs, skipCustomDevs bool, sriovNics, ovsOffloadNics []HostNic, nvmePciDisks, amdVgpuPFs, nvidiaVgpuPFs []string)
ProbePCIDevices(skipGPUs, skipUSBs, skipCustomDevs bool, sriovNics, ovsOffloadNics []HostNic, nvmePciDisks, amdVgpuPFs, nvidiaVgpuPFs []string, enableWhitelist bool)
StartDetachTask()
BatchCustomProbe()
AppendDetachedDevice(dev *CloudDeviceInfo)
Expand All @@ -137,7 +137,7 @@ func (man *isolatedDeviceManager) GetDevices() []IDevice {
return man.devices
}

func (man *isolatedDeviceManager) probeGPUS(skipGPUs bool, amdVgpuPFs, nvidiaVgpuPFs []string) {
func (man *isolatedDeviceManager) probeGPUS(skipGPUs bool, amdVgpuPFs, nvidiaVgpuPFs []string, enableWhitelist bool, whitelistModels []IsolatedDeviceModel) {
if skipGPUs {
return
}
Expand All @@ -148,10 +148,10 @@ func (man *isolatedDeviceManager) probeGPUS(skipGPUs bool, amdVgpuPFs, nvidiaVgp
filteredAddrs = append(filteredAddrs, man.devices[i].GetAddr())
}

gpus, err, warns := getPassthroughGPUs(filteredAddrs)
gpus, err, warns := getPassthroughGPUs(filteredAddrs, enableWhitelist, whitelistModels)
if err != nil {
// ignore getPassthroughGPUS error on old machines without VGA devices
log.Errorf("getPassthroughGPUS: %v", err)
log.Errorf("getPassthroughGPUS error: %v", err)
man.host.AppendError(fmt.Sprintf("get passhtrough gpus %s", err.Error()), "isolated_devices", "", " ")
} else {
if len(warns) > 0 {
Expand All @@ -166,26 +166,20 @@ func (man *isolatedDeviceManager) probeGPUS(skipGPUs bool, amdVgpuPFs, nvidiaVgp
}
}

func (man *isolatedDeviceManager) probeCustomPCIDevs(skipCustomDevs bool) {
func (man *isolatedDeviceManager) probeCustomPCIDevs(skipCustomDevs bool, devModels []IsolatedDeviceModel, filterClassCodes []string) {
if skipCustomDevs {
return
}
devModels, err := man.getCustomIsolatedDeviceModels()
if err != nil {
log.Errorf("get custom isolated device models %s", err.Error())
man.host.AppendError(fmt.Sprintf("get custom isolated device models %s", err.Error()), "isolated_devices", "", "")
} else {
for _, devModel := range devModels {
devs, err := getPassthroughPCIDevs(devModel)
if err != nil {
log.Errorf("getPassthroughPCIDevs %v: %s", devModel, err)
man.host.AppendError(fmt.Sprintf("get custom passthrough pci devices %s", err.Error()), "isolated_devices", "", "")
continue
}
for i, dev := range devs {
man.devices = append(man.devices, dev)
log.Infof("Add general pci device: %d => %#v", i, dev)
}
for _, devModel := range devModels {
devs, err := getPassthroughPCIDevs(devModel, filterClassCodes)
if err != nil {
log.Errorf("getPassthroughPCIDevs %v: %s", devModel, err)
man.host.AppendError(fmt.Sprintf("get custom passthrough pci devices %s", err.Error()), "isolated_devices", "", "")
continue
}
for i, dev := range devs {
man.devices = append(man.devices, dev)
log.Infof("Add general pci device: %d => %#v", i, dev)
}
}
}
Expand Down Expand Up @@ -306,19 +300,21 @@ func (man *isolatedDeviceManager) probeNVIDIAVgpus(nvidiaVgpuPFs []string) {
}
}

func (man *isolatedDeviceManager) ProbePCIDevices(
skipGPUs, skipUSBs, skipCustomDevs bool,
sriovNics, ovsOffloadNics []HostNic,
nvmePciDisks, amdVgpuPFs, nvidiaVgpuPFs []string,
) {
func (man *isolatedDeviceManager) ProbePCIDevices(skipGPUs, skipUSBs, skipCustomDevs bool, sriovNics, ovsOffloadNics []HostNic, nvmePciDisks, amdVgpuPFs, nvidiaVgpuPFs []string, enableWhitelist bool) {
man.devices = make([]IDevice, 0)
devModels, err := man.getCustomIsolatedDeviceModels()
if err != nil {
log.Errorf("get isolated device devModels %s", err.Error())
man.host.AppendError(fmt.Sprintf("get custom isolated device devModels %s", err.Error()), "isolated_devices", "", "")
return
}
man.probeUSBs(skipUSBs)
man.probeCustomPCIDevs(skipCustomDevs)
man.probeCustomPCIDevs(skipCustomDevs, devModels, GpuClassCodes)
man.probeSRIOVNics(sriovNics)
man.probeOffloadNICS(ovsOffloadNics)
man.probeAMDVgpus(amdVgpuPFs)
man.probeNVIDIAVgpus(nvidiaVgpuPFs)
man.probeGPUS(skipGPUs, amdVgpuPFs, nvidiaVgpuPFs)
man.probeGPUS(skipGPUs, amdVgpuPFs, nvidiaVgpuPFs, enableWhitelist, devModels)
}

type IsolatedDeviceModel struct {
Expand All @@ -335,7 +331,7 @@ func (man *isolatedDeviceManager) getCustomIsolatedDeviceModels() ([]IsolatedDev
params.Set("scope", jsonutils.NewString("system"))
res, err := modules.IsolatedDeviceModels.List(man.getSession(), jsonutils.NewDict())
if err != nil {
return nil, err
return nil, errors.Wrap(err, "list isolated_device_models from compute service")
}
devModels := make([]IsolatedDeviceModel, len(res.Data))
for i, obj := range res.Data {
Expand Down
16 changes: 14 additions & 2 deletions pkg/hostman/isolated_device/pci_device.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ package isolated_device
import (
"fmt"
"strings"

"yunion.io/x/jsonutils"
"yunion.io/x/pkg/errors"
"yunion.io/x/pkg/utils"
)

type sGeneralPCIDevice struct {
Expand All @@ -41,7 +45,7 @@ func newGeneralPCIDevice(dev *PCIDevice, devType string) *sGeneralPCIDevice {
}
}

func getPassthroughPCIDevs(devModel IsolatedDeviceModel) ([]*sGeneralPCIDevice, error) {
func getPassthroughPCIDevs(devModel IsolatedDeviceModel, filteredCodes []string) ([]*sGeneralPCIDevice, error) {
ret, err := bashOutput(fmt.Sprintf("lspci -d %s:%s -nnmm", devModel.VendorId, devModel.DeviceId))
if err != nil {
return nil, err
Expand All @@ -54,12 +58,20 @@ func getPassthroughPCIDevs(devModel IsolatedDeviceModel) ([]*sGeneralPCIDevice,
}

devs := []*sGeneralPCIDevice{}
errs := make([]error, 0)
for _, line := range lines {
dev := NewPCIDevice2(line)
if dev.ModelName == "" {
dev.ModelName = devModel.Model
}
if utils.IsInStringArray(dev.ClassCode, filteredCodes) {
continue
}
if err := dev.checkSameIOMMUGroupDevice(); err != nil {
errs = append(errs, errors.Wrapf(err, "get dev %s iommu group devices by model: %s", dev.Addr, jsonutils.Marshal(devModel)))
continue
}
devs = append(devs, newGeneralPCIDevice(dev, devModel.DevType))
}
return devs, nil
return devs, errors.NewAggregate(errs)
}

0 comments on commit c646b3c

Please sign in to comment.