-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
enhance: prevent multiple query nodes from causing excessive occupanc…
…y of a single node, leading to GPU memory overflow (#39276) (#38617) issue: #39276 Signed-off-by: yusheng.ma <[email protected]>
- Loading branch information
1 parent
0df2c75
commit 38881bf
Showing
7 changed files
with
179 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
//go:build !cuda | ||
// +build !cuda | ||
|
||
package hardware | ||
|
||
import "github.com/cockroachdb/errors" | ||
|
||
// GPUMemoryInfo holds information about a GPU's memory | ||
type GPUMemoryInfo struct { | ||
TotalMemory uint64 // Total memory available on the GPU | ||
FreeMemory uint64 // Free memory available on the GPU | ||
} | ||
|
||
// GetAllGPUMemoryInfo returns mock GPU memory information for non-CUDA builds | ||
func GetAllGPUMemoryInfo() ([]GPUMemoryInfo, error) { | ||
// Mock error to indicate no CUDA support | ||
return nil, errors.New("CUDA not supported: failed to retrieve GPU memory info or no GPUs found") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
//go:build cuda | ||
// +build cuda | ||
|
||
package hardware | ||
|
||
/* | ||
#cgo CFLAGS: -I/usr/local/cuda/include | ||
#cgo LDFLAGS: -L/usr/local/cuda/lib64 -lcudart | ||
#include <cuda_runtime.h> | ||
#include <stdlib.h> | ||
// Structure to store GPU memory info | ||
typedef struct { | ||
size_t totalMemory; | ||
size_t freeMemory; | ||
} GPUMemoryInfo; | ||
// Function to get memory info for all GPUs | ||
int getAllGPUMemoryInfo(GPUMemoryInfo** infos) { | ||
int deviceCount = 0; | ||
cudaError_t err = cudaGetDeviceCount(&deviceCount); | ||
if (err != cudaSuccess || deviceCount == 0) { | ||
return 0; // No GPUs found or error occurred | ||
} | ||
// Allocate memory for the output array | ||
*infos = (GPUMemoryInfo*)malloc(deviceCount * sizeof(GPUMemoryInfo)); | ||
if (*infos == NULL) { | ||
return 0; // Memory allocation failed | ||
} | ||
for (int i = 0; i < deviceCount; ++i) { | ||
if (cudaSetDevice(i) != cudaSuccess) { | ||
(*infos)[i].totalMemory = 0; | ||
(*infos)[i].freeMemory = 0; | ||
continue; // Skip if the device cannot be set | ||
} | ||
size_t freeMem = 0, totalMem = 0; | ||
if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess) { | ||
(*infos)[i].totalMemory = 0; | ||
(*infos)[i].freeMemory = 0; | ||
continue; // Skip if memory info cannot be fetched | ||
} | ||
(*infos)[i].totalMemory = totalMem; | ||
(*infos)[i].freeMemory = freeMem; | ||
} | ||
return deviceCount; // Return the number of devices processed | ||
} | ||
*/ | ||
import "C" | ||
import ( | ||
"github.com/cockroachdb/errors" | ||
"unsafe" | ||
) | ||
|
||
// GPUMemoryInfo represents a single GPU's memory information. | ||
type GPUMemoryInfo struct { | ||
TotalMemory uint64 // Total memory in bytes | ||
FreeMemory uint64 // Free memory in bytes | ||
} | ||
|
||
// GetAllGPUMemoryInfo retrieves the memory information for all available GPUs. | ||
// It returns a slice of GPUMemoryInfo and an error if no GPUs are found or retrieval fails. | ||
func GetAllGPUMemoryInfo() ([]GPUMemoryInfo, error) { | ||
var infos *C.GPUMemoryInfo | ||
|
||
// Call the C function to retrieve GPU memory info | ||
deviceCount := int(C.getAllGPUMemoryInfo(&infos)) | ||
if deviceCount == 0 { | ||
return nil, errors.New("failed to retrieve GPU memory info or no GPUs found") | ||
} | ||
defer C.free(unsafe.Pointer(infos)) // Free the allocated memory | ||
|
||
// Convert C array to Go slice | ||
gpuInfos := make([]GPUMemoryInfo, 0, deviceCount) | ||
infoArray := (*[1 << 30]C.GPUMemoryInfo)(unsafe.Pointer(infos))[:deviceCount:deviceCount] | ||
|
||
for i := 0; i < deviceCount; i++ { | ||
info := infoArray[i] | ||
gpuInfos = append(gpuInfos, GPUMemoryInfo{ | ||
TotalMemory: uint64(info.totalMemory), | ||
FreeMemory: uint64(info.freeMemory), | ||
}) | ||
} | ||
|
||
return gpuInfos, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters