-
Notifications
You must be signed in to change notification settings - Fork 14
node condtion remediation #126
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -4,14 +4,20 @@ | |||||||||||||||||||||||||||||||||||||||||||
| "context" | ||||||||||||||||||||||||||||||||||||||||||||
| "fmt" | ||||||||||||||||||||||||||||||||||||||||||||
| "os" | ||||||||||||||||||||||||||||||||||||||||||||
| "os/exec" | ||||||||||||||||||||||||||||||||||||||||||||
| "path/filepath" | ||||||||||||||||||||||||||||||||||||||||||||
| "strconv" | ||||||||||||||||||||||||||||||||||||||||||||
| "strings" | ||||||||||||||||||||||||||||||||||||||||||||
| "sync" | ||||||||||||||||||||||||||||||||||||||||||||
| "sync/atomic" | ||||||||||||||||||||||||||||||||||||||||||||
| "time" | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| "github.com/sirupsen/logrus" | ||||||||||||||||||||||||||||||||||||||||||||
| "github.com/spf13/cobra" | ||||||||||||||||||||||||||||||||||||||||||||
| "google.golang.org/grpc" | ||||||||||||||||||||||||||||||||||||||||||||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||||||||||||||||||||||||||||||||||||||||
| "k8s.io/client-go/kubernetes" | ||||||||||||||||||||||||||||||||||||||||||||
| "k8s.io/client-go/tools/clientcmd" | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| _ "github.com/Azure/AKSFlexNode/components" | ||||||||||||||||||||||||||||||||||||||||||||
| "github.com/Azure/AKSFlexNode/components/services/inmem" | ||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -231,6 +237,7 @@ | |||||||||||||||||||||||||||||||||||||||||||
| if driftEnabled { | ||||||||||||||||||||||||||||||||||||||||||||
| startNodeDriftDetectionAndRemediationLoop(ctx, cfg, conn, logger, cfgMu, bootstrapInProgress, detectors, wg) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
| startNodeConditionLoop(ctx, cfg, logger, wg) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| func snapshotConfig(cfg *config.Config, cfgMu *sync.RWMutex) *config.Config { | ||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -460,3 +467,116 @@ | |||||||||||||||||||||||||||||||||||||||||||
| // For bootstrap, return error on failure | ||||||||||||||||||||||||||||||||||||||||||||
| return fmt.Errorf("%s failed: %s", operation, result.Error) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| func getBootTime() (time.Time, error) { | ||||||||||||||||||||||||||||||||||||||||||||
| data, err := os.ReadFile("/proc/uptime") | ||||||||||||||||||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||||||||||||||||||
| return time.Time{}, fmt.Errorf("failed to read /proc/uptime: %w", err) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| // /proc/uptime contains two numbers: uptime in seconds and idle time | ||||||||||||||||||||||||||||||||||||||||||||
| // We only need the first number | ||||||||||||||||||||||||||||||||||||||||||||
| fields := strings.Fields(string(data)) | ||||||||||||||||||||||||||||||||||||||||||||
| if len(fields) < 1 { | ||||||||||||||||||||||||||||||||||||||||||||
| return time.Time{}, fmt.Errorf("invalid /proc/uptime format") | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| uptimeSeconds, err := strconv.ParseFloat(fields[0], 64) | ||||||||||||||||||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||||||||||||||||||
| return time.Time{}, fmt.Errorf("failed to parse uptime: %w", err) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| // Calculate boot time: current time - uptime | ||||||||||||||||||||||||||||||||||||||||||||
| bootTime := time.Now().Add(-time.Duration(uptimeSeconds * float64(time.Second))) | ||||||||||||||||||||||||||||||||||||||||||||
| return bootTime, nil | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| func getNodeName() (string, error) { | ||||||||||||||||||||||||||||||||||||||||||||
| data, err := os.ReadFile("/etc/hostname") | ||||||||||||||||||||||||||||||||||||||||||||
| if err != nil { | ||||||||||||||||||||||||||||||||||||||||||||
| return "", fmt.Errorf("failed to read /etc/hostname: %w", err) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| nodeName := strings.TrimSpace(string(data)) | ||||||||||||||||||||||||||||||||||||||||||||
| if nodeName == "" { | ||||||||||||||||||||||||||||||||||||||||||||
| return "", fmt.Errorf("node name is empty in /etc/hostname") | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| return nodeName, nil | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| func rebootNode() error { | ||||||||||||||||||||||||||||||||||||||||||||
| rebootCmd := exec.Command("/usr/bin/nsenter", "-m/proc/1/ns/mnt", | ||||||||||||||||||||||||||||||||||||||||||||
| "/bin/bash", "-c", "echo b > /proc/sysrq-trigger") | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| return rebootCmd.Run() | ||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+508
to
+512
|
||||||||||||||||||||||||||||||||||||||||||||
| rebootCmd := exec.Command("/usr/bin/nsenter", "-m/proc/1/ns/mnt", | |
| "/bin/bash", "-c", "echo b > /proc/sysrq-trigger") | |
| return rebootCmd.Run() | |
| // Use a bounded context so the reboot command can't hang indefinitely. | |
| ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) | |
| defer cancel() | |
| // #nosec G204 -- command and arguments are constant literals; no user input is interpolated. | |
| rebootCmd := exec.CommandContext(ctx, "/usr/bin/nsenter", "-m/proc/1/ns/mnt", | |
| "/bin/bash", "-c", "echo b > /proc/sysrq-trigger") | |
| output, err := rebootCmd.CombinedOutput() | |
| if ctx.Err() == context.DeadlineExceeded { | |
| return fmt.Errorf("reboot command timed out: %w; output: %s", err, strings.TrimSpace(string(output))) | |
| } | |
| if err != nil { | |
| return fmt.Errorf("reboot command failed: %w; output: %s", err, strings.TrimSpace(string(output))) | |
| } | |
| return nil |
Copilot
AI
Mar 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This introduces an unconditional host reboot action (via sysrq-trigger) in the main daemon loop. Consider gating this behavior behind an explicit config/feature flag (similar to EnableDriftDetectionAndRemediation) and adding rate limiting/guardrails to reduce the risk of reboot loops or unexpected reboots in environments that don’t want automated power actions.
Copilot
AI
Mar 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
startDaemonLoops pre-increments the WaitGroup counter for the loops it starts, but startNodeConditionLoop also calls wg.Add(1) internally. This hidden increment is inconsistent with the other loops and makes it easier to accidentally introduce a WaitGroup misuse/panic later; consider moving the Add(1) into startDaemonLoops and removing it from here.
| wg.Add(1) |
Fixed
Show fixed
Hide fixed
Copilot
AI
Mar 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
config, err := clientcmd.BuildConfigFromFlags(..., "/var/lib/kubelet/kubelet/kubeconfig") hardcodes a path that already exists as config.KubeletKubeconfigPath and also introduces a local variable named config that shadows the imported pkg/config identifier. Use the shared constant and rename the local to something like kubeConfig to avoid confusion.
Copilot
AI
Mar 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this loop, errors when loading kubeconfig cause a return, which stops the goroutine permanently and prevents any future node-condition checks. Consider logging the error and continue to the next tick (or implement a backoff) so transient failures don’t disable remediation for the lifetime of the agent process.
| return | |
| } |
runzhen marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Copilot
AI
Mar 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
clientcmd.BuildConfigFromFlags and kubernetes.NewForConfig are executed on every tick. Since these typically only depend on local files and can be reused, consider creating the REST config/clientset once outside the ticker loop and reusing them (recreating only on failure) to reduce per-minute I/O and allocations.
Outdated
Copilot
AI
Mar 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These error paths return from the goroutine, which permanently stops node-condition monitoring after a transient failure (e.g., kubeconfig temporarily unavailable). Prefer logging and continue to the next tick so the daemon can self-recover like the other loops in this file.
runzhen marked this conversation as resolved.
Show resolved
Hide resolved
Copilot
AI
Mar 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
defer cancel() is inside a long-running for loop, so the cancels will be deferred until the goroutine exits (potentially never), leaking per-iteration resources. Scope the timeout context to a small inner function/block and call cancel() at the end of each iteration instead of deferring in the outer loop.
| // Get the node | |
| node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) | |
| // Get the node with a per-call context to avoid leaking resources across iterations | |
| ctx, cancel := context.WithCancel(context.Background()) | |
| node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) | |
| cancel() |
runzhen marked this conversation as resolved.
Show resolved
Hide resolved
Copilot
AI
Mar 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the Node GET fails, the code logs the error but then continues and dereferences node.Status.Conditions, which will panic when node is nil. This should return/continue on error (and avoid attempting remediation) so the agent doesn’t crash the loop on API failures.
Copilot
AI
Mar 12, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the Nodes().Get call returns an error, node can be nil; the code then ranges over node.Status.Conditions, which can panic. Handle the error by returning/continuing before dereferencing node (or guard against nil).
Uh oh!
There was an error while loading. Please reload this page.