fix: agent panic when node is terminated during step execution (#3331)

Fixes https://github.com/woodpecker-ci/woodpecker/issues/3330

This adds error handling to the agent's WaitStep function in the two
places where it could hit a `panic: runtime error: invalid memory
address or nil pointer dereference` when it could no longer access
complete information about a specific pod.

This error was found to happen if the node in which the pod was running
was terminated during the step's execution, despite active pipelines
being executed on the node.

Now, instead of a panic in the agent's logs and undefined behavior in
the UI, a more helpful error message is displayed in the UI.

### Additional context

We observed the bug first on v2.1.1, but tested the fix internally on
top of 2.3.0.


![image](https://github.com/woodpecker-ci/woodpecker/assets/7269710/dfbcf089-85f7-4b5d-8102-f21af95c5cda)
This commit is contained in:
Fernando Barbosa 2024-02-05 18:46:14 -03:00 committed by GitHub
parent e324d18a74
commit c7467b9828
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -263,11 +263,23 @@ func (e *kube) WaitStep(ctx context.Context, step *types.Step, taskUUID string)
}
if isImagePullBackOffState(pod) {
return nil, fmt.Errorf("could not pull image for pod %s", pod.Name)
return nil, fmt.Errorf("could not pull image for pod %s", podName)
}
if len(pod.Status.ContainerStatuses) == 0 {
return nil, fmt.Errorf("no container statuses found for pod %s", podName)
}
cs := pod.Status.ContainerStatuses[0]
if cs.State.Terminated == nil {
err := fmt.Errorf("no terminated state found for container %s/%s", podName, cs.Name)
log.Error().Str("taskUUID", taskUUID).Str("pod", podName).Str("container", cs.Name).Interface("state", cs.State).Msg(err.Error())
return nil, err
}
bs := &types.State{
ExitCode: int(pod.Status.ContainerStatuses[0].State.Terminated.ExitCode),
ExitCode: int(cs.State.Terminated.ExitCode),
Exited: true,
OOMKilled: false,
}