Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{Draft}{AKS} Troubleshooting command line tool #8150

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
58 changes: 58 additions & 0 deletions src/aks-preview/azext_aks_preview/debug/im/data collection.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Data Collection

## vmss run-command in azure-cli

### use case: invoke

```bash
az vmss run-command invoke -g MC_azcli-aks-dev_dev100_westus2 -n aks-nodepool1-28844989-vmss --command-id RunShellScript --instance-id 0 --scripts 'for i in $(seq $1 $2); do echo $i; done' --parameters 1 100000
```

- Synchronous operation: the caller must wait until the command has finished running.
- The output is truncated and cannot be automatically exported to external storage.

### use case: CRUD

```bash
az vmss run-command list -g MC_azcli-aks-dev_dev100_westus2 --vmss-name aks-nodepool1-28844989-vmss --instance-id "0"
az vmss run-command show -g MC_azcli-aks-dev_dev100_westus2 -n aks-nodepool1-28844989-vmss --instance-id 0 --name

# run command for the first time
az vmss run-command create -g MC_azcli-aks-dev_dev100_westus2 --vmss-name aks-nodepool1-28844989-vmss --instance-id "0" --run-command-name "t1" --script 'for i in $(seq $abc $xyz); do echo $i; done' --parameters abc=1 xyz=100000
az vmss run-command show -g MC_azcli-aks-dev_dev100_westus2 --vmss-name aks-nodepool1-28844989-vmss --instance-id "0" --run-command-name "t1" --instance-view # show command result

# run command for the second time
az vmss run-command update -g MC_azcli-aks-dev_dev100_westus2 --vmss-name aks-nodepool1-28844989-vmss --instance-id "0" --run-command-name "t1" --script 'for i in $(seq $abc $xyz); do echo $i; done' --parameters abc=1 xyz=1000000 --output-blob-uri "https://aksclidebug.blob.core.windows.net/aksclidebug/abc?xxx"

# generate storage account container sas token
end=`date -u -d "30 minutes" '+%Y-%m-%dT%H:%MZ'`
az storage container generate-sas --account-name aksclidebug -n aksclidebug --permissions acrw --expiry $end --https-only

# run command for the third time
az vmss run-command update -g MC_azcli-aks-dev_dev100_westus2 --vmss-name aks-nodepool1-28844989-vmss --instance-id "0" --run-command-name "t1" --script 'for i in $(seq $abc $xyz); do echo $i; done' --parameters abc=1 xyz=1000000 --output-blob-uri "https://aksclidebug.blob.core.windows.net/aksclidebug/xyz?xxx"
```

- Asynchronous operation: the same run command can be executed multiple times via the `update` command.
- The output is truncated, but the full output can be exported automatically to external storage (see `--output-blob-uri` above).

## kubectl debug

### use busybox to debug

```bash
node_name=$(kubectl get no -o json | jq -r '.items[0].metadata.name')
kubectl debug no/${node_name} -i --image=mcr.microsoft.com/cbl-mariner/busybox:2.0
busybox_pod_name=$(kubectl get po -o json | jq -r '.items[]|select(.status.phase=="Running")|select(.spec.containers[0].image=="mcr.microsoft.com/cbl-mariner/busybox:2.0")|.metadata.name')
kubectl exec ${busybox_pod_name} -- nslookup google.com
```

### get journal log

```bash
node_name=$(kubectl get no -o json | jq -r '.items[0].metadata.name')
kubectl debug no/${node_name} -i --image=mcr.microsoft.com/cbl-mariner/base/core:2.0
debug_pod_name=$(kubectl get po -o json | jq -r '.items[]|select(.status.phase=="Running")|select(.spec.containers[0].image=="mcr.microsoft.com/cbl-mariner/base/core:2.0")|.metadata.name')
kubectl exec ${debug_pod_name} -- tdnf install systemd tar -y
kubectl exec ${debug_pod_name} -- chroot /host sh -c "journalctl > journal.log"
kubectl cp ${debug_pod_name}:/host/journal.log journal.log
```
56 changes: 56 additions & 0 deletions src/aks-preview/azext_aks_preview/debug/im/data_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from typing import Dict
from random import randint


class DataCollector:
    """Base class for data collectors with a simple one-slot cache.

    Subclasses override :meth:`run` to populate ``self.data``; callers read
    the value through :meth:`get_data`, which triggers :meth:`run` lazily.
    """

    def __init__(self) -> None:
        # Cached result of the last run(); None means "nothing collected yet".
        self.data = None

    def run(self) -> None:
        """Collect the data and store it in ``self.data``.

        The base implementation is a no-op placeholder.
        """
        # some code to collect data

    def get_data(self, refresh_cached_data=False):
        """Return the collected data, collecting it first when necessary.

        Args:
            refresh_cached_data: When true, call :meth:`run` again even if a
                value is already cached.

        Returns:
            The current value of ``self.data`` (``None`` if :meth:`run` did
            not set anything).
        """
        if self.data is None or refresh_cached_data:
            self.run()
        return self.data

    def gc(self) -> None:
        """Clean up the resources used by the data collector (placeholder)."""
        # Declared with ``self`` so it can be called on an instance; the
        # original signature took no arguments and raised TypeError there.

    def export(self) -> None:
        """Export the data to a file/remote storage (placeholder)."""
        # Declared with ``self`` for the same reason as gc().


class DataCollectorCoreDNSConfigMap(DataCollector):
    """Placeholder collector whose ``run`` stores either "a" or "b".

    NOTE(review): the class name suggests this will eventually read the
    CoreDNS config map; the current body only picks a random stand-in value.
    """

    def run(self) -> None:
        """Set ``self.data`` to "a" when ``randint(0, 1)`` is 1, else "b"."""
        # some code to collect data
        if randint(0, 1):
            self.data = "a"
        else:
            self.data = "b"


class DataCollectorIGDNS(DataCollector):
    """Placeholder collector whose ``run`` stores either "c" or "d".

    NOTE(review): no real DNS data is gathered yet; the body only picks a
    random stand-in value.
    """

    def run(self) -> None:
        """Set ``self.data`` to "c" when ``randint(0, 1)`` is 1, else "d"."""
        # some code to collect data
        if randint(0, 1):
            self.data = "c"
        else:
            self.data = "d"


class SharedDataCollector:
    """Holds one instance of each data collector, keyed by name.

    Because each collector caches its result, every consumer that goes
    through the same ``SharedDataCollector`` sees the same collected value.
    """

    def __init__(self) -> None:
        collectors: Dict[str, DataCollector] = {}
        collectors["core_dns_config_map"] = DataCollectorCoreDNSConfigMap()
        collectors["ig_dns"] = DataCollectorIGDNS()
        self.data_collectors = collectors

    def get_core_dns_config_map_data(self):
        """Return the (cached) data of the "core_dns_config_map" collector."""
        collector = self.data_collectors["core_dns_config_map"]
        return collector.get_data()

    def get_ig_dns_data(self):
        """Return the (cached) data of the "ig_dns" collector."""
        collector = self.data_collectors["ig_dns"]
        return collector.get_data()
32 changes: 32 additions & 0 deletions src/aks-preview/azext_aks_preview/debug/im/knowledge_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import Dict, List

from .types import DebugStep, ActionStep
from .knowledge_base_debug import DebugStepA, DebugStepB, DebugStepC
from .knowledge_base_action import ActionStepA, ActionStepB, ActionStepC


class KnowledgeBase:
    """Registry of the known debug steps and action steps, keyed by name."""

    def __init__(self) -> None:
        # Instantiate every registered step once, debug steps first.
        debug_step_types = (("a", DebugStepA), ("b", DebugStepB), ("c", DebugStepC))
        action_step_types = (("a", ActionStepA), ("b", ActionStepB), ("c", ActionStepC))
        self.debug_steps: Dict[str, DebugStep] = {
            key: step_type() for key, step_type in debug_step_types
        }
        self.action_steps: Dict[str, ActionStep] = {
            key: step_type() for key, step_type in action_step_types
        }

    def get_debug_step_by_name(self, name: str) -> DebugStep:
        """Return the debug step registered as ``name`` (KeyError if absent)."""
        return self.debug_steps[name]

    def get_action_step_by_name(self, name: str) -> ActionStep:
        """Return the action step registered as ``name`` (KeyError if absent)."""
        return self.action_steps[name]

    def get_debug_steps_by_scenario(self, scenario: str) -> List[DebugStep]:
        """Return every debug step whose ``tags`` contain ``scenario``.

        The result is a new list in registration order.
        """
        return [
            candidate
            for candidate in self.debug_steps.values()
            if scenario in candidate.tags
        ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from .types import Step, ActionStep


class ActionStepA(ActionStep):
    """Placeholder action step "A"; construction is inherited unchanged."""

    def run(self) -> Step:
        """Print this step object; nothing is returned."""
        print(self)


class ActionStepB(ActionStep):
    """Placeholder action step "B"; construction is inherited unchanged."""

    def run(self) -> Step:
        """Print this step object; nothing is returned."""
        print(self)


class ActionStepC(ActionStep):
    """Placeholder action step "C"; construction is inherited unchanged."""

    def run(self) -> Step:
        """Print this step object; nothing is returned."""
        print(self)
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from .types import Step, DebugStep, NoActionStep
from .knowledge_base_action import ActionStepA, ActionStepB, ActionStepC


class DebugStepA(DebugStep):
    """Debug step tagged "dns" that branches on the core-DNS config-map data."""

    def __init__(self) -> None:
        super().__init__()
        self.tags.append("dns")

    def run(self) -> Step:
        """Print this step, read the shared data and queue one follow-up step.

        "a" queues ``ActionStepA``, "b" queues ``DebugStepB`` and any other
        value queues ``NoActionStep``. Nothing is returned.
        """
        print(self)
        data = self.shared_data_collector.get_core_dns_config_map_data()
        # Pick the follow-up first, then append exactly once.
        if data == "a":
            follow_up = ActionStepA()
        elif data == "b":
            follow_up = DebugStepB()
        else:
            follow_up = NoActionStep()
        self.next_steps.append(follow_up)


class DebugStepB(DebugStep):
    """Debug step tagged "dns" that branches on the IG DNS data."""

    def __init__(self) -> None:
        super().__init__()
        self.tags.append("dns")

    def run(self) -> Step:
        """Print this step, read the shared data and queue one follow-up step.

        "c" queues ``ActionStepB``, "d" queues ``ActionStepC`` and any other
        value queues ``NoActionStep``. Nothing is returned.
        """
        print(self)
        data = self.shared_data_collector.get_ig_dns_data()
        # Pick the follow-up first, then append exactly once.
        if data == "c":
            follow_up = ActionStepB()
        elif data == "d":
            follow_up = ActionStepC()
        else:
            follow_up = NoActionStep()
        self.next_steps.append(follow_up)


class DebugStepC(DebugStep):
    """Debug step tagged "egress" that always ends with ``NoActionStep``."""

    def __init__(self) -> None:
        super().__init__()
        self.tags.append("egress")

    def run(self) -> Step:
        """Print this step and queue a ``NoActionStep``; nothing is returned."""
        print(self)
        terminal_step = NoActionStep()
        self.next_steps.append(terminal_step)
26 changes: 26 additions & 0 deletions src/aks-preview/azext_aks_preview/debug/im/orchestrator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from .data_collector import SharedDataCollector
from .knowledge_base import KnowledgeBase
from .types import DebugStep


class Orchestrator:
    """Runs the debug steps of a scenario round by round.

    Each round executes every pending step and gathers the steps they queue;
    those become the next round. The loop ends when a round queues nothing.
    """

    def __init__(self) -> None:
        self.steps = []
        self.shared_data_collector = SharedDataCollector()
        self.knowledge_base = KnowledgeBase()

    def run(self, scenario: str) -> None:
        """Execute all steps reachable from the debug steps tagged ``scenario``.

        Prints a "Round N" banner before each round. Debug steps receive the
        orchestrator's shared data collector right before they run.
        """
        self.steps = self.knowledge_base.get_debug_steps_by_scenario(scenario)
        pending = self.steps
        round_no = 1
        while pending:
            print(f"Round {round_no}")
            upcoming = []
            for step in pending:
                if isinstance(step, DebugStep):
                    step.attch_shared_data_collector(self.shared_data_collector)
                step.run()
                upcoming += step.get_next_steps()
            pending = upcoming
            round_no += 1
40 changes: 40 additions & 0 deletions src/aks-preview/azext_aks_preview/debug/im/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from __future__ import annotations
from typing import List

from .data_collector import SharedDataCollector


class Step:
    """Base class for a unit of work in the troubleshooting flow.

    Attributes are plain lists created per instance:
    ``tags`` holds the labels attached to the step and ``next_steps`` holds
    the steps queued to run after this one.
    """

    def __init__(self) -> None:
        self.tags: List[str] = []
        self.next_steps: List[Step] = []

    def run(self) -> Step:
        """Execute the step; the base implementation does nothing."""
        return None

    def get_next_steps(self) -> List[Step]:
        """Return the list of queued follow-up steps (the live list, not a copy)."""
        follow_ups = self.next_steps
        return follow_ups


class DebugStep(Step):
    """A step that inspects data obtained through a shared data collector.

    The collector is injected after construction via
    :meth:`attach_shared_data_collector`.
    """

    def __init__(self) -> None:
        super().__init__()
        # Declared here so the attribute always exists; it stays None until a
        # collector is attached.
        self.shared_data_collector: SharedDataCollector | None = None

    def attach_shared_data_collector(self, shared_data_collector: SharedDataCollector) -> None:
        """Store ``shared_data_collector`` for use by this step's ``run``."""
        self.shared_data_collector = shared_data_collector

    def attch_shared_data_collector(self, shared_data_collector: SharedDataCollector) -> None:
        """Misspelled legacy name kept for existing callers.

        Delegates to :meth:`attach_shared_data_collector`.
        """
        self.attach_shared_data_collector(shared_data_collector)


class ActionStep(Step):
    """Marker base class for action steps; adds nothing to its parent class."""


class NoActionStep(Step):
    """Terminal step: prints a marker and never yields follow-up steps."""

    def run(self) -> Step:
        """Print "NoActionStep"; nothing is returned."""
        print("NoActionStep")

    def get_next_steps(self) -> List[Step]:
        """Always return a fresh empty list, ignoring ``self.next_steps``."""
        return list()
8 changes: 8 additions & 0 deletions src/aks-preview/azext_aks_preview/debug/im/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import subprocess


def get_configmap(namespace, name):
    """Return the JSON text printed by ``kubectl get cm`` for one config map.

    Args:
        namespace: Value passed to kubectl's ``-n`` option.
        name: Name of the config map to fetch.

    Returns:
        kubectl's standard output decoded as text.

    Raises:
        subprocess.CalledProcessError: If kubectl exits with a non-zero status.
    """
    command = ["kubectl", "get", "cm", "-n", namespace, name, "-o", "json"]
    return subprocess.check_output(command, universal_newlines=True)
9 changes: 9 additions & 0 deletions src/aks-preview/azext_aks_preview/debug/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from im.orchestrator import Orchestrator


def main():
    """Create an orchestrator and run the "dns" scenario."""
    orchestrator = Orchestrator()
    orchestrator.run("dns")


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading