Skip to content

Commit

Permalink
feat: implement retina shell CLI command (#962)
Browse files Browse the repository at this point in the history
# Description

Implement `retina shell` CLI command for ad hoc network debugging of
nodes and pods.

## Related Issue

#910 

## Checklist

- [x] I have read the [contributing
documentation](https://retina.sh/docs/contributing).
- [x] I signed and signed-off the commits (`git commit -S -s ...`). See
[this
documentation](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification)
on signing commits.
- [x] I have correctly attributed the author(s) of the code.
- [x] I have tested the changes locally.
- [x] I have followed the project's style guidelines.
- [x] I have updated the documentation, if necessary.
- [x] I have added tests, if applicable.

## Screenshots (if applicable) or Testing Completed

<img width="695" alt="image"
src="https://github.com/user-attachments/assets/9a534ec3-34d9-448d-9f0f-0915db4daa8e">

## Additional Notes

For testing, you need to set `RETINA_SHELL_IMAGE_REPO` and
`RETINA_SHELL_IMAGE_VERSION` until the retina-shell image is published.

---

Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) file for more
information on how to contribute to this project.

---------

Signed-off-by: Will Daly <[email protected]>
  • Loading branch information
wedaly authored Dec 4, 2024
1 parent ebca883 commit f134af6
Show file tree
Hide file tree
Showing 10 changed files with 773 additions and 0 deletions.
173 changes: 173 additions & 0 deletions cli/cmd/shell.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
package cmd

import (
"errors"
"fmt"
"os"
"time"

"github.com/microsoft/retina/internal/buildinfo"
"github.com/microsoft/retina/shell"
"github.com/spf13/cobra"
v1 "k8s.io/api/core/v1"
"k8s.io/cli-runtime/pkg/genericclioptions"
"k8s.io/cli-runtime/pkg/resource"
cmdutil "k8s.io/kubectl/pkg/cmd/util"
"k8s.io/kubectl/pkg/scheme"
"k8s.io/kubectl/pkg/util/templates"
)

// Package-level state for the `retina shell` command. All of these are
// populated in init(): the two kubeconfig helpers are constructed there and
// the remaining values are bound to CLI flags on shellCmd.
var (
// configFlags carries the standard kubeconfig flags; it is also passed to the
// resource builder that resolves the command's positional argument.
configFlags *genericclioptions.ConfigFlags
// matchVersionFlags wraps configFlags; shellCmd uses it to resolve the
// namespace and to build the REST config.
matchVersionFlags *cmdutil.MatchVersionFlags
// retinaShellImageRepo is bound to --retina-shell-image-repo. When the flag is
// not set explicitly, a non-empty RETINA_SHELL_IMAGE_REPO env var replaces it.
retinaShellImageRepo string
// retinaShellImageVersion is bound to --retina-shell-image-version. When the
// flag is not set explicitly, a non-empty RETINA_SHELL_IMAGE_VERSION env var
// replaces it.
retinaShellImageVersion string
// mountHostFilesystem is bound to --mount-host-filesystem (-m).
mountHostFilesystem bool
// allowHostFilesystemWrite is bound to --allow-host-filesystem-write (-w).
allowHostFilesystemWrite bool
// hostPID is bound to --host-pid.
hostPID bool
// capabilities is bound to --capabilities (-c); values are forwarded to
// shell.Config unchanged.
capabilities []string
// timeout is bound to --timeout and forwarded to shell.Config.
timeout time.Duration
)

// Flag defaults and sentinel errors used by the `retina shell` command.
var (
// AKS requires clusters to allow access to MCR, so use this repository by default.
defaultRetinaShellImageRepo = "mcr.microsoft.com/containernetworking/retina-shell"

// Default version is the same as CLI version, set at link time.
// It may be empty when the CLI is built without -ldflags; shellCmd rejects
// an empty version at run time.
defaultRetinaShellImageVersion = buildinfo.Version

// defaultTimeout is the default value of the --timeout flag.
defaultTimeout = 30 * time.Second

// errMissingRequiredRetinaShellImageVersionArg is returned by shellCmd when
// the image version is empty after flag and env var processing.
errMissingRequiredRetinaShellImageVersionArg = errors.New("missing required --retina-shell-image-version")
// errUnsupportedResourceType is wrapped into the error returned by shellCmd
// when the resolved object is neither a Node nor a Pod.
errUnsupportedResourceType = errors.New("unsupported resource type")
)

// shellCmd implements `retina shell`: it resolves its single positional
// argument to a Kubernetes object and starts a shell container for it.
//
//   - A Node is handled by shell.RunInNode, using the namespace resolved from
//     kubeconfig / --namespace for the debug pod.
//   - A Pod is handled by shell.RunInPod, using the pod's own namespace and name.
//   - Any other kind fails with an error wrapping errUnsupportedResourceType.
//
// The shell image reference is "<repo>:<version>"; both parts must be
// non-empty, otherwise the command fails before contacting the cluster.
var shellCmd = &cobra.Command{
	Use:   "shell (NODE | TYPE[[.VERSION].GROUP]/NAME)",
	Short: "[EXPERIMENTAL] Interactively debug a node or pod",
	Long: templates.LongDesc(`
[EXPERIMENTAL] This is an experimental command. The flags and behavior may change in the future.
Start a shell with networking tools in a node or pod for adhoc debugging.
* For nodes, this creates a pod on the node in the root network namespace.
* For pods, this creates an ephemeral container inside the pod's network namespace.
You can override the default image used for the shell container with either
CLI flags (--retina-shell-image-repo and --retina-shell-image-version) or
environment variables (RETINA_SHELL_IMAGE_REPO and RETINA_SHELL_IMAGE_VERSION).
CLI flags take precedence over env vars.
`),

	Example: templates.Examples(`
# start a shell in a node
kubectl retina shell node0001
# start a shell in a node, with debug pod in kube-system namespace
kubectl retina shell -n kube-system node0001
# start a shell as an ephemeral container inside an existing pod
kubectl retina shell -n kube-system pod/coredns-d459997b4-7cpzx
# start a shell in a node, mounting the host filesystem to /host with ability to chroot
kubectl retina shell node001 --mount-host-filesystem --capabilities SYS_CHROOT
# start a shell in a node, with NET_RAW and NET_ADMIN capabilities
# (required for iptables and tcpdump)
kubectl retina shell node001 --capabilities NET_RAW,NET_ADMIN
`),
	Args: cobra.ExactArgs(1),
	RunE: func(_ *cobra.Command, args []string) error {
		// retinaShellImageVersion defaults to the CLI version, but that might not be set if the CLI is built without -ldflags.
		if retinaShellImageVersion == "" {
			return errMissingRequiredRetinaShellImageVersionArg
		}
		// The repo has a non-empty default and empty env vars are ignored, but an
		// explicit `--retina-shell-image-repo ""` would otherwise yield the invalid
		// image reference ":<version>". Checked after the version so that the
		// pre-existing error still wins when both are empty.
		if retinaShellImageRepo == "" {
			return errMissingRequiredRetinaShellImageRepoArg
		}

		namespace, explicitNamespace, err := matchVersionFlags.ToRawKubeConfigLoader().Namespace()
		if err != nil {
			return fmt.Errorf("error retrieving namespace arg: %w", err)
		}

		// This interprets the first arg as either a node or pod (same as kubectl):
		// "node001" -> node
		// "node/node001" -> node
		// "pod/example-7cpzx" -> pod
		r := resource.NewBuilder(configFlags).
			WithScheme(scheme.Scheme, scheme.Scheme.PrioritizedVersionsAllGroups()...).
			FilenameParam(explicitNamespace, &resource.FilenameOptions{}).
			NamespaceParam(namespace).DefaultNamespace().ResourceNames("nodes", args[0]).
			Do()
		if rerr := r.Err(); rerr != nil {
			return fmt.Errorf("error constructing resource builder: %w", rerr)
		}

		restConfig, err := matchVersionFlags.ToRESTConfig()
		if err != nil {
			return fmt.Errorf("error constructing REST config: %w", err)
		}

		// Flag values are forwarded as-is; the shell package decides how to apply them.
		config := shell.Config{
			RestConfig:               restConfig,
			RetinaShellImage:         fmt.Sprintf("%s:%s", retinaShellImageRepo, retinaShellImageVersion),
			MountHostFilesystem:      mountHostFilesystem,
			AllowHostFilesystemWrite: allowHostFilesystemWrite,
			HostPID:                  hostPID,
			Capabilities:             capabilities,
			Timeout:                  timeout,
		}

		return r.Visit(func(info *resource.Info, err error) error {
			if err != nil {
				return err
			}

			switch obj := info.Object.(type) {
			case *v1.Node:
				// Nodes are not namespaced, so the debug pod uses the namespace
				// resolved from kubeconfig / --namespace.
				podDebugNamespace := namespace
				nodeName := obj.Name
				return shell.RunInNode(config, nodeName, podDebugNamespace)
			case *v1.Pod:
				return shell.RunInPod(config, obj.Namespace, obj.Name)
			default:
				gvk := obj.GetObjectKind().GroupVersionKind()
				return fmt.Errorf("unsupported resource %s/%s: %w", gvk.GroupVersion(), gvk.Kind, errUnsupportedResourceType)
			}
		})
	},
}

// errMissingRequiredRetinaShellImageRepoArg is returned by shellCmd when the
// image repository is empty after flag and env var processing.
var errMissingRequiredRetinaShellImageRepoArg = errors.New("missing required --retina-shell-image-repo")

// init attaches the shell subcommand to the Retina root command, installs a
// pre-run hook that applies environment-variable overrides for the image, and
// registers the command's flags.
func init() {
	Retina.AddCommand(shellCmd)

	// NOTE(review): cobra by default runs only the nearest PersistentPreRun hook,
	// so a hook defined on a parent command (if any) would not run for this
	// command — confirm that is intended.
	shellCmd.PersistentPreRun = func(cmd *cobra.Command, _ []string) {
		// Avoid printing full usage message if the command exits with an error.
		cmd.SilenceUsage = true
		cmd.SilenceErrors = true

		// Allow setting image repo and version via environment variables (CLI flags still take precedence).
		envOverrides := []struct {
			flagName string
			envName  string
			target   *string
		}{
			{"retina-shell-image-repo", "RETINA_SHELL_IMAGE_REPO", &retinaShellImageRepo},
			{"retina-shell-image-version", "RETINA_SHELL_IMAGE_VERSION", &retinaShellImageVersion},
		}
		for _, o := range envOverrides {
			if cmd.Flags().Changed(o.flagName) {
				// An explicitly passed flag always wins over the environment.
				continue
			}
			// Unset and empty env vars are treated the same: keep the flag default.
			if val := os.Getenv(o.envName); val != "" {
				*o.target = val
			}
		}
	}

	// Flags specific to the shell command.
	localFlags := shellCmd.Flags()
	localFlags.StringVar(&retinaShellImageRepo, "retina-shell-image-repo", defaultRetinaShellImageRepo, "The container registry repository for the image to use for the shell container")
	localFlags.StringVar(&retinaShellImageVersion, "retina-shell-image-version", defaultRetinaShellImageVersion, "The version (tag) of the image to use for the shell container")
	localFlags.BoolVarP(&mountHostFilesystem, "mount-host-filesystem", "m", false, "Mount the host filesystem to /host. Applies only to nodes, not pods.")
	localFlags.BoolVarP(&allowHostFilesystemWrite, "allow-host-filesystem-write", "w", false,
		"Allow write access to the host filesystem. Implies --mount-host-filesystem. Applies only to nodes, not pods.")
	localFlags.BoolVar(&hostPID, "host-pid", false, "Set HostPID on the shell container. Applies only to nodes, not pods.")
	localFlags.StringSliceVarP(&capabilities, "capabilities", "c", []string{}, "Add capabilities to the shell container")
	localFlags.DurationVar(&timeout, "timeout", defaultTimeout, "The maximum time to wait for the shell container to start")

	// configFlags and matchVersion flags are used to load kubeconfig.
	// This uses the same mechanism as `kubectl debug` to connect to apiserver and attach to containers.
	persistentFlags := shellCmd.PersistentFlags()
	configFlags = genericclioptions.NewConfigFlags(true)
	configFlags.AddFlags(persistentFlags)
	matchVersionFlags = cmdutil.NewMatchVersionFlags(configFlags)
	matchVersionFlags.AddFlags(persistentFlags)
}
185 changes: 185 additions & 0 deletions docs/06-Troubleshooting/shell.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# Shell TSG

**EXPERIMENTAL: `retina shell` is an experimental feature, so the flags and behavior may change in future versions.**

The `retina shell` command allows you to start an interactive shell on a Kubernetes node or pod. This runs a container image with many common networking tools installed (`ping`, `curl`, etc.).

## Testing connectivity

Start a shell on a node or inside a pod:

```bash
# To start a shell in a node (root network namespace):
kubectl retina shell aks-nodepool1-15232018-vmss000001

# To start a shell inside a pod (pod network namespace):
kubectl retina shell -n kube-system pods/coredns-d459997b4-7cpzx
```

Check connectivity using `ping`:

```text
root [ / ]# ping 10.224.0.4
PING 10.224.0.4 (10.224.0.4) 56(84) bytes of data.
64 bytes from 10.224.0.4: icmp_seq=1 ttl=64 time=0.964 ms
64 bytes from 10.224.0.4: icmp_seq=2 ttl=64 time=1.13 ms
64 bytes from 10.224.0.4: icmp_seq=3 ttl=64 time=0.908 ms
64 bytes from 10.224.0.4: icmp_seq=4 ttl=64 time=1.07 ms
64 bytes from 10.224.0.4: icmp_seq=5 ttl=64 time=1.01 ms
--- 10.224.0.4 ping statistics ---
5 packets transmitted, 5 received, 0% packet loss, time 4022ms
rtt min/avg/max/mdev = 0.908/1.015/1.128/0.077 ms
```

Check DNS resolution using `dig`:

```text
root [ / ]# dig example.com +short
93.184.215.14
```

The tools `nslookup` and `drill` are also available if you prefer those.

Check connectivity to apiserver using `nc` and `curl`:

```text
root [ / ]# nc -zv 10.0.0.1 443
Ncat: Version 7.95 ( https://nmap.org/ncat )
Ncat: Connected to 10.0.0.1:443.
Ncat: 0 bytes sent, 0 bytes received in 0.06 seconds.
root [ / ]# curl -k https://10.0.0.1
{
"kind": "Status",
"apiVersion": "v1",
"metadata": {},
"status": "Failure",
"message": "Unauthorized",
"reason": "Unauthorized",
"code": 401
}
```

### nftables and iptables

Accessing nftables and iptables rules requires `NET_RAW` and `NET_ADMIN` capabilities.

```bash
kubectl retina shell aks-nodepool1-15232018-vmss000002 --capabilities NET_ADMIN,NET_RAW
```

Then you can run `iptables` and `nft`:

```text
root [ / ]# iptables -nvL | head -n 2
Chain INPUT (policy ACCEPT 1191K packets, 346M bytes)
pkts bytes target prot opt in out source destination
root [ / ]# nft list ruleset | head -n 2
# Warning: table ip filter is managed by iptables-nft, do not touch!
table ip filter {
```

**If you see the error "Operation not permitted (you must be root)", check that your `kubectl retina shell` command sets `--capabilities NET_RAW,NET_ADMIN`.**

`iptables` in the shell image uses `iptables-legacy`, which may or may not match the configuration on the node. For example, Ubuntu maps `iptables` to `iptables-nft`. To use the exact same `iptables` binary as installed on the node, you will need to `chroot` into the host filesystem (see below).

## Accessing the host filesystem

On nodes, you can mount the host filesystem to `/host`:

```bash
kubectl retina shell aks-nodepool1-15232018-vmss000002 --mount-host-filesystem
```

This mounts the host filesystem (`/`) to `/host` in the debug pod:

```text
root [ / ]# ls /host
NOTICE.txt bin boot dev etc home lib lib64 libx32 lost+found media mnt opt proc root run sbin srv sys tmp usr var
```

The host filesystem is mounted read-only by default. If you need write access, use the `--allow-host-filesystem-write` flag.

Symlinks between files on the host filesystem may not resolve correctly. If you see "No such file or directory" errors for symlinks, try following the instructions below to `chroot` to the host filesystem.

## Chroot to the host filesystem

`chroot` requires the `SYS_CHROOT` capability:

```bash
kubectl retina shell aks-nodepool1-15232018-vmss000002 --mount-host-filesystem --capabilities SYS_CHROOT
```

Then you can use `chroot` to start a shell inside the host filesystem:

```text
root [ / ]# chroot /host bash
root@aks-nodepool1-15232018-vmss000002:/# cat /etc/resolv.conf | tail -n 2
nameserver 168.63.129.16
search shncgv2kgepuhm1ls1dwgholsd.cx.internal.cloudapp.net
```

`chroot` allows you to:

* Execute binaries installed on the node.
* Resolve symlinks that point to files in the host filesystem (such as /etc/resolv.conf -> /run/systemd/resolve/resolv.conf).
* Use `sysctl` to view or modify kernel parameters.
* Use `journalctl` to view systemd unit and kernel logs.
* Use `ip netns` to view network namespaces. (However, `ip netns exec` does not work.)

## Systemctl

`systemctl` commands require both `chroot` to the host filesystem and host PID:

```bash
kubectl retina shell aks-nodepool1-15232018-vmss000002 --mount-host-filesystem --capabilities SYS_CHROOT --host-pid
```

Then `chroot` to the host filesystem and run `systemctl status`:

```text
root [ / ]# chroot /host systemctl status | head -n 2
● aks-nodepool1-15232018-vmss000002
State: running
```

**If `systemctl` shows an error "Failed to connect to bus: No data available", check that the `retina shell` command has `--host-pid` set and that you have chroot'd to /host.**

## Troubleshooting

### Timeouts

If `kubectl retina shell` fails with a timeout error, then:

1. Increase the timeout by setting `--timeout` flag.
2. Check the pod using `kubectl describe pod` to determine why retina shell is failing to start.

Example:

```bash
kubectl retina shell --timeout 10m node001 # increase timeout to 10 minutes
```

### Firewalls and ImagePullBackoff

Some clusters are behind a firewall that blocks pulling the retina-shell image. To work around this:

1. Replicate the retina-shell images to a container registry accessible from within the cluster.
2. Override the image used by Retina CLI with the environment variable `RETINA_SHELL_IMAGE_REPO`.

Example:

```bash
export RETINA_SHELL_IMAGE_REPO="example.azurecr.io/retina/retina-shell"
export RETINA_SHELL_IMAGE_VERSION=v0.0.1 # optional, if not set defaults to the Retina CLI version.
kubectl retina shell node0001 # this will use the image "example.azurecr.io/retina/retina-shell:v0.0.1"
```

## Limitations

* Windows nodes and pods are not yet supported.
* `bpftool` and `bpftrace` are not supported.
* The shell image links `iptables` commands to `iptables-legacy`, even if the node itself links to `iptables-nft`.
* `nsenter` is not supported.
* `ip netns` will not work without `chroot` to the host filesystem.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,11 @@ require (
github.com/evanphx/json-patch v5.9.0+incompatible // indirect
github.com/evanphx/json-patch/v5 v5.9.0 // indirect
github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect
github.com/fatih/camelcase v1.0.0 // indirect
github.com/fatih/color v1.16.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/fvbommel/sortorder v1.1.0 // indirect
github.com/go-errors/errors v1.4.2 // indirect
github.com/go-gorp/gorp/v3 v3.1.0 // indirect
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,8 @@ github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0
github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ=
github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d h1:105gxyaGwCFad8crR9dcMQWvV9Hvulu6hwUh4tWPJnM=
github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d/go.mod h1:ZZMPRZwes7CROmyNKgQzC3XPs6L/G2EJLHddWejkmf4=
github.com/fatih/camelcase v1.0.0 h1:hxNvNX/xYBp0ovncs8WyWZrOrpBNub/JfaMvbURyft8=
github.com/fatih/camelcase v1.0.0/go.mod h1:yN2Sb0lFhZJUdVvtELVWefmrXpuZESvPmqwoZc+/fpc=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU=
github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk=
Expand All @@ -314,6 +316,8 @@ github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMo
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
github.com/fvbommel/sortorder v1.1.0 h1:fUmoe+HLsBTctBDoaBwpQo5N+nrCp8g/BjKb/6ZQmYw=
github.com/fvbommel/sortorder v1.1.0/go.mod h1:uk88iVf1ovNn1iLfgUVU2F9o5eO30ui720w+kxuqRs0=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/go-chi/chi v4.1.2+incompatible h1:fGFk2Gmi/YKXk0OmGfBh0WgmN3XB8lVnEyNz34tQRec=
github.com/go-chi/chi v4.1.2+incompatible/go.mod h1:eB3wogJHnLi3x/kFX2A+IbTBlXxmMeXJVKy9tTv1XzQ=
Expand Down
Loading

0 comments on commit f134af6

Please sign in to comment.