Skip to content

Commit

Permalink
views,*: Improved NVidia support + fixes
Browse files Browse the repository at this point in the history
Improve the NVidia parser-file, add stub list- and info-views
for NVIDIADriver, fix a bug in the logparser key_value rule
(hopefully without introducing new issues), and add a bit
of highlighting for some error states.

Signed-off-by: David Weinehall <[email protected]>
  • Loading branch information
taotriad committed Oct 26, 2024
1 parent b66df71 commit a063d51
Show file tree
Hide file tree
Showing 10 changed files with 210 additions and 4 deletions.
4 changes: 2 additions & 2 deletions clustermanagementtoolkit/logparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2075,8 +2075,8 @@ def key_value(message: str, **kwargs: Any) -> Tuple[str, LogLevel, str,
collector_bullets = deep_get(options, DictPath("collector_bullets"), False)
is_event: bool = deep_get(options, DictPath("is_event"), False)

# Split all key=value pairs.
key_value_regex = re.compile(r"^(.*?)=(.*)")
# Split all key=value pairs. Make sure not to process "=="
key_value_regex = re.compile(r"^(.*?[^=])=($|[^=].*$)")
tmp = re.findall(r"(?:\".*?\"|\S)+", message.replace("\\\"", "<<<quote>>>"))
# pylint: disable-next=too-many-nested-blocks
if tmp is not None:
Expand Down
55 changes: 55 additions & 0 deletions parsers/nvidia.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,68 @@
- name: "nvidia"
show_in_selector: false
matchkeys:
- image_name: "/nvidia/k8s-device-plugin"
- image_name: "/nvidia/cloud-native/k8s-driver-manager"
container_type: "init_container"
parser_rules:
- name: "glog"
- name: "custom_line"
options:
block_start:
- matchtype: "exact"
matchkey: "Running with config:"
matchline: "any"
format_block_start: true
block_end:
- matchtype: "exact"
matchkey: "}"
matchline: "any"
format_block_end: true
loglevel: "info"
- name: "nvidia"
show_in_selector: false
matchkeys:
- image_name: "/nvidia/k8s/container-toolkit"
- image_name: "/nvidia/k8s/dcgm-exporter"
- image_name: "/nvidia/cloud-native/gpu-operator-validator"
container_type: "init_container"
parser_rules:
- name: "key_value"
- name: "override_severity"
overrides:
- matchtype: "startswith"
matchkey: "version: "
loglevel: "notice"
- matchtype: "startswith"
matchkey: "NVIDIA-SMI has failed"
loglevel: "error"
- matchtype: "startswith"
matchkey: "CRITICAL:"
loglevel: "critical"
- name: "custom_line"
options:
block_start:
- matchtype: "exact"
matchkey: "Using config:"
matchline: "any"
format_block_start: true
block_end:
- matchtype: "startswith"
matchkey: "time"
matchline: "any"
format_block_end: false
process_block_end: false
loglevel: "info"
- name: "custom_line"
options:
block_start:
- matchtype: "startswith"
matchkey: "+---------------"
matchline: "any"
format_block_start: true
block_end:
- matchtype: "startswith"
matchkey: "+---------------"
matchline: "any"
format_block_end: true
loglevel: "info"
43 changes: 42 additions & 1 deletion views/Challenge.acme.cert-manager.io.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,46 @@ listview:
group: "Certificate Management"
field_indexes:
Normal:
fields: ["namespace", "name", "age"]
fields: ["namespace", "name", "state", "age"]
sortcolumn: "namespace"
fields:
state:
header: "State:"
path: "status#state"
type: "str"
formatter: "value_mapper"
formatting:
mapping:
mappings:
errored:
field_colors:
- context: "main"
type: "status_not_ok"
expired:
field_colors:
- context: "main"
type: "status_warning"
invalid:
field_colors:
- context: "main"
type: "status_not_ok"
pending:
field_colors:
- context: "main"
type: "status_pending"
processing:
field_colors:
- context: "main"
type: "status_pending"
ready:
field_colors:
- context: "main"
type: "status_done"
valid:
field_colors:
- context: "main"
type: "status_ok"
__default:
field_colors:
- context: "main"
type: "status_unknown"
10 changes: 9 additions & 1 deletion views/ClusterPolicy.nvidia.com.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ command:
- "nvclrpol"
listview:
name: "Cluster Policies"
group: "API & Extendability"
group: "Accelerators"
field_indexes:
Normal:
fields: ["name", "state", "age"]
Expand All @@ -24,10 +24,18 @@ listview:
formatting:
mapping:
mappings:
ignored:
field_colors:
- context: "main"
type: "status_ok"
notReady:
field_colors:
- context: "main"
type: "status_not_ok"
ready:
field_colors:
- context: "main"
type: "status_ok"
infoview:
name: "Cluster Policy Info"
infopad:
Expand Down
8 changes: 8 additions & 0 deletions views/Event.events.k8s.io.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,10 @@ listview:
field_colors:
- context: "main"
type: "status_ok"
GPUDriverUpgrade:
field_colors:
- context: "main"
type: "status_ok"
HcoUpdateError:
field_colors:
- context: "main"
Expand Down Expand Up @@ -2466,6 +2470,10 @@ infoview:
field_colors:
- context: "main"
type: "status_ok"
GPUDriverUpgrade:
field_colors:
- context: "main"
type: "status_ok"
HcoUpdateError:
field_colors:
- context: "main"
Expand Down
8 changes: 8 additions & 0 deletions views/Event.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,10 @@ listview:
field_colors:
- context: "main"
type: "status_ok"
GPUDriverUpgrade:
field_colors:
- context: "main"
type: "status_ok"
HcoUpdateError:
field_colors:
- context: "main"
Expand Down Expand Up @@ -2461,6 +2465,10 @@ infoview:
field_colors:
- context: "main"
type: "status_ok"
GPUDriverUpgrade:
field_colors:
- context: "main"
type: "status_ok"
HcoUpdateError:
field_colors:
- context: "main"
Expand Down
43 changes: 43 additions & 0 deletions views/NVIDIADriver.nvidia.com.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
kind: "NVIDIADriver"
api_family: "nvidia.com"
default_command: "nvidiadrivers"
command:
- "nvidiadrvs"
- "nvidiadrv"
- "nvdrvs"
- "nvdrv"
listview:
name: "NVIDIA Drivers"
group: "Accelerators"
field_indexes:
Normal:
fields: ["name", "driver_type", "state", "age"]
fields:
driver_type:
header: "Driver Type:"
path: "spec#driverType"
default: "gpu"
type: "str"
state:
header: "State:"
path: "status#state"
type: "str"
formatter: "value_mapper"
formatting:
mapping:
mappings:
ignored:
field_colors:
- context: "main"
type: "status_ok"
notReady:
field_colors:
- context: "main"
type: "status_not_ok"
ready:
field_colors:
- context: "main"
type: "status_ok"
infoview:
name: "NVIDIA Driver Info"
infopad:
3 changes: 3 additions & 0 deletions views/NodeFeatureGroup.nfd.k8s-sigs.io.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,6 @@ listview:
field_indexes:
Normal:
fields: ["namespace", "name", "age"]
infoview:
name: "Node Feature Group Info"
infopad:
36 changes: 36 additions & 0 deletions views/Order.acme.cert-manager.io.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,39 @@ listview:
header: "State:"
path: "status#state"
type: "str"
formatter: "value_mapper"
formatting:
mapping:
mappings:
errored:
field_colors:
- context: "main"
type: "status_not_ok"
expired:
field_colors:
- context: "main"
type: "status_warning"
invalid:
field_colors:
- context: "main"
type: "status_not_ok"
pending:
field_colors:
- context: "main"
type: "status_pending"
processing:
field_colors:
- context: "main"
type: "status_pending"
ready:
field_colors:
- context: "main"
type: "status_done"
valid:
field_colors:
- context: "main"
type: "status_ok"
__default:
field_colors:
- context: "main"
type: "status_unknown"
4 changes: 4 additions & 0 deletions views/variables/event_reasons.var
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,10 @@ Generated:
field_colors:
- context: "main"
type: "status_ok"
GPUDriverUpgrade:
field_colors:
- context: "main"
type: "status_ok"
HcoUpdateError:
field_colors:
- context: "main"
Expand Down

0 comments on commit a063d51

Please sign in to comment.