Skip to content

Commit

Permalink
Merge branch 'master' into improve-metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
tiithansen authored Dec 13, 2024
2 parents fb40b23 + 3c14ee0 commit 8c32cd2
Show file tree
Hide file tree
Showing 9 changed files with 156 additions and 35 deletions.
17 changes: 15 additions & 2 deletions .github/actions/execute-assert-arc-e2e/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -188,15 +188,28 @@ runs:
}
core.setFailed(`The triggered workflow run didn't finish properly using ${{inputs.arc-name}}`)
- name: Gather listener logs
shell: bash
if: always()
run: |
LISTENER_POD="$(kubectl get autoscalinglisteners.actions.github.com -n arc-systems -o jsonpath='{.items[*].metadata.name}')"
kubectl logs $LISTENER_POD -n ${{inputs.arc-controller-namespace}}
- name: Gather coredns logs
shell: bash
if: always()
run: |
kubectl logs deployments/coredns -n kube-system
- name: cleanup
if: inputs.wait-to-finish == 'true'
shell: bash
run: |
helm uninstall ${{ inputs.arc-name }} --namespace ${{inputs.arc-namespace}} --debug
kubectl wait --timeout=30s --for=delete AutoScalingRunnerSet -n ${{inputs.arc-namespace}} -l app.kubernetes.io/instance=${{ inputs.arc-name }}
- name: Gather logs and cleanup
- name: Gather controller logs
shell: bash
if: always()
run: |
kubectl logs deployment/arc-gha-rs-controller -n ${{inputs.arc-controller-namespace}}
kubectl logs deployment/arc-gha-rs-controller -n ${{inputs.arc-controller-namespace}}
16 changes: 16 additions & 0 deletions .github/workflows/gha-e2e-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ jobs:
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
sleep 60
- name: Test ARC E2E
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
Expand Down Expand Up @@ -194,6 +196,8 @@ jobs:
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
sleep 60
- name: Test ARC E2E
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
Expand Down Expand Up @@ -284,6 +288,8 @@ jobs:
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
sleep 60
- name: Test ARC E2E
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
Expand Down Expand Up @@ -383,6 +389,8 @@ jobs:
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
sleep 60
- name: Test ARC E2E
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
Expand Down Expand Up @@ -484,6 +492,8 @@ jobs:
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
sleep 60
- name: Test ARC E2E
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
Expand Down Expand Up @@ -579,6 +589,8 @@ jobs:
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
sleep 60
- name: Test ARC E2E
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
Expand Down Expand Up @@ -699,6 +711,8 @@ jobs:
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
sleep 60
- name: Test ARC E2E
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
Expand Down Expand Up @@ -789,6 +803,8 @@ jobs:
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
kubectl get pod -n arc-systems
sleep 60
- name: Trigger long running jobs and wait for runners to pick them up
uses: ./.github/actions/execute-assert-arc-e2e
timeout-minutes: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ spec:
{{- with .Values.flags.watchSingleNamespace }}
- "--watch-single-namespace={{ . }}"
{{- end }}
{{- with .Values.runnerMaxConcurrentReconciles }}
- "--runner-max-concurrent-reconciles={{ . }}"
{{- end }}
{{- with .Values.flags.updateStrategy }}
- "--update-strategy={{ . }}"
{{- end }}
Expand Down
5 changes: 5 additions & 0 deletions charts/gha-runner-scale-set-controller/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ flags:
## Defaults to watch all namespaces when unset.
# watchSingleNamespace: ""

## The maximum number of concurrent reconciles which can be run by the EphemeralRunner controller.
# Increase this value to improve the throughput of the controller.
# It may also increase the load on the API server and the external service (e.g. GitHub API).
runnerMaxConcurrentReconciles: 2

## Defines how the controller should handle upgrades while having running jobs.
##
## The strategies available are:
Expand Down
69 changes: 46 additions & 23 deletions controllers/actions.github.com/ephemeralrunner_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ

if ephemeralRunner.Status.RunnerId == 0 {
log.Info("Creating new ephemeral runner registration and updating status with runner config")
return r.updateStatusWithRunnerConfig(ctx, ephemeralRunner, log)
if r, err := r.updateStatusWithRunnerConfig(ctx, ephemeralRunner, log); r != nil {
return *r, err
}
}

secret := new(corev1.Secret)
Expand All @@ -189,7 +191,17 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
}
// create secret if not created
log.Info("Creating new ephemeral runner secret for jitconfig.")
return r.createSecret(ctx, ephemeralRunner, log)
if r, err := r.createSecret(ctx, ephemeralRunner, log); r != nil {
return *r, err
}

// Retry to get the secret that was just created.
// Otherwise, even though we want to continue to create the pod,
// it fails due to the missing secret resulting in an invalid pod spec.
if err := r.Get(ctx, req.NamespacedName, secret); err != nil {
log.Error(err, "Failed to fetch secret")
return ctrl.Result{}, err
}
}

pod := new(corev1.Pod)
Expand Down Expand Up @@ -511,12 +523,12 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem

// updateStatusWithRunnerConfig fetches runtime configuration needed by the runner
// This method should always set .status.runnerId and .status.runnerJITConfig
func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (ctrl.Result, error) {
func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (*ctrl.Result, error) {
// Runner is not registered with the service. We need to register it first
log.Info("Creating ephemeral runner JIT config")
actionsClient, err := r.actionsClientFor(ctx, ephemeralRunner)
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to get actions client for generating JIT config: %v", err)
return &ctrl.Result{}, fmt.Errorf("failed to get actions client for generating JIT config: %v", err)
}

jitSettings := &actions.RunnerScaleSetJitRunnerSetting{
Expand All @@ -534,12 +546,12 @@ func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Con
if err != nil {
actionsError := &actions.ActionsError{}
if !errors.As(err, &actionsError) {
return ctrl.Result{}, fmt.Errorf("failed to generate JIT config with generic error: %v", err)
return &ctrl.Result{}, fmt.Errorf("failed to generate JIT config with generic error: %v", err)
}

if actionsError.StatusCode != http.StatusConflict ||
!actionsError.IsException("AgentExistsException") {
return ctrl.Result{}, fmt.Errorf("failed to generate JIT config with Actions service error: %v", err)
return &ctrl.Result{}, fmt.Errorf("failed to generate JIT config with Actions service error: %v", err)
}

// If the runner with the name we want already exists it means:
Expand All @@ -552,29 +564,29 @@ func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Con
log.Info("Getting runner jit config failed with conflict error, trying to get the runner by name", "runnerName", ephemeralRunner.Name)
existingRunner, err := actionsClient.GetRunnerByName(ctx, ephemeralRunner.Name)
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to get runner by name: %v", err)
return &ctrl.Result{}, fmt.Errorf("failed to get runner by name: %v", err)
}

if existingRunner == nil {
log.Info("Runner with the same name does not exist, re-queuing the reconciliation")
return ctrl.Result{Requeue: true}, nil
return &ctrl.Result{Requeue: true}, nil
}

log.Info("Found the runner with the same name", "runnerId", existingRunner.Id, "runnerScaleSetId", existingRunner.RunnerScaleSetId)
if existingRunner.RunnerScaleSetId == ephemeralRunner.Spec.RunnerScaleSetId {
log.Info("Removing the runner with the same name")
err := actionsClient.RemoveRunner(ctx, int64(existingRunner.Id))
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to remove runner from the service: %v", err)
return &ctrl.Result{}, fmt.Errorf("failed to remove runner from the service: %v", err)
}

log.Info("Removed the runner with the same name, re-queuing the reconciliation")
return ctrl.Result{Requeue: true}, nil
return &ctrl.Result{Requeue: true}, nil
}

// TODO: Do we want to mark the ephemeral runner as failed, and let EphemeralRunnerSet to clean it up, so we can recover from this situation?
// The situation is that the EphemeralRunner's name is already used by something else to register a runner, and we can't take the control back.
return ctrl.Result{}, fmt.Errorf("runner with the same name but doesn't belong to this RunnerScaleSet: %v", err)
return &ctrl.Result{}, fmt.Errorf("runner with the same name but doesn't belong to this RunnerScaleSet: %v", err)
}
log.Info("Created ephemeral runner JIT config", "runnerId", jitConfig.Runner.Id)

Expand All @@ -585,11 +597,20 @@ func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Con
obj.Status.RunnerJITConfig = jitConfig.EncodedJITConfig
})
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to update runner status for RunnerId/RunnerName/RunnerJITConfig: %v", err)
return &ctrl.Result{}, fmt.Errorf("failed to update runner status for RunnerId/RunnerName/RunnerJITConfig: %v", err)
}

// We want to continue without a requeue for faster pod creation.
//
// To do so, we update the status in-place, so that both continuing the loop and
// and requeuing and skipping updateStatusWithRunnerConfig in the next loop, will
// have the same effect.
ephemeralRunner.Status.RunnerId = jitConfig.Runner.Id
ephemeralRunner.Status.RunnerName = jitConfig.Runner.Name
ephemeralRunner.Status.RunnerJITConfig = jitConfig.EncodedJITConfig

log.Info("Updated ephemeral runner status with runnerId and runnerJITConfig")
return ctrl.Result{}, nil
return nil, nil
}

func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alpha1.EphemeralRunner, secret *corev1.Secret, log logr.Logger) (ctrl.Result, error) {
Expand Down Expand Up @@ -665,21 +686,21 @@ func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alp
return ctrl.Result{}, nil
}

func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, log logr.Logger) (ctrl.Result, error) {
func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, log logr.Logger) (*ctrl.Result, error) {
log.Info("Creating new secret for ephemeral runner")
jitSecret := r.ResourceBuilder.newEphemeralRunnerJitSecret(runner)

if err := ctrl.SetControllerReference(runner, jitSecret, r.Scheme); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set controller reference: %v", err)
return &ctrl.Result{}, fmt.Errorf("failed to set controller reference: %v", err)
}

log.Info("Created new secret spec for ephemeral runner")
if err := r.Create(ctx, jitSecret); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to create jit secret: %v", err)
return &ctrl.Result{}, fmt.Errorf("failed to create jit secret: %v", err)
}

log.Info("Created ephemeral runner secret", "secretName", jitSecret.Name)
return ctrl.Result{Requeue: true}, nil
return nil, nil
}

// updateRunStatusFromPod is responsible for updating non-exiting statuses.
Expand Down Expand Up @@ -823,12 +844,14 @@ func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context,
}

// SetupWithManager sets up the controller with the Manager.
func (r *EphemeralRunnerReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&v1alpha1.EphemeralRunner{}).
Owns(&corev1.Pod{}).
WithEventFilter(predicate.ResourceVersionChangedPredicate{}).
Complete(r)
func (r *EphemeralRunnerReconciler) SetupWithManager(mgr ctrl.Manager, opts ...Option) error {
return builderWithOptions(
ctrl.NewControllerManagedBy(mgr).
For(&v1alpha1.EphemeralRunner{}).
Owns(&corev1.Pod{}).
WithEventFilter(predicate.ResourceVersionChangedPredicate{}),
opts,
).Complete(r)
}

func runnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus {
Expand Down
56 changes: 56 additions & 0 deletions controllers/actions.github.com/options.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package actionsgithubcom

import (
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/controller"
)

// Options is the optional configuration for the controllers, which can be
// set via command-line flags or environment variables.
type Options struct {
// RunnerMaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run
// by the EphemeralRunnerController.
RunnerMaxConcurrentReconciles int
}

// OptionsWithDefault returns the default options.
// This is here to maintain the options and their default values in one place,
// rather than having to correlate those in multiple places.
func OptionsWithDefault() Options {
return Options{
RunnerMaxConcurrentReconciles: 2,
}
}

type Option func(*controller.Options)

// WithMaxConcurrentReconciles sets the maximum number of concurrent Reconciles which can be run.
//
// This is useful to improve the throughput of the controller, but it may also increase the load on the API server and
// the external service (e.g. GitHub API). The default value is 1, as defined by the controller-runtime.
//
// See https://github.com/actions/actions-runner-controller/issues/3021 for more information
// on real-world use cases and the potential impact of this option.
func WithMaxConcurrentReconciles(n int) Option {
return func(b *controller.Options) {
b.MaxConcurrentReconciles = n
}
}

// builderWithOptions applies the given options to the provided builder, if any.
// This is a helper function to avoid the need to import the controller-runtime package in every reconciler source file
// and the command package that creates the controller.
// This is also useful for reducing code duplication around setting controller options in
// multiple reconcilers.
func builderWithOptions(b *builder.Builder, opts []Option) *builder.Builder {
if len(opts) == 0 {
return b
}

var controllerOpts controller.Options
for _, opt := range opts {
opt(&controllerOpts)
}

return b.WithOptions(controllerOpts)
}
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ module github.com/actions/actions-runner-controller
go 1.22.4

require (
github.com/bradleyfalzon/ghinstallation/v2 v2.8.0
github.com/bradleyfalzon/ghinstallation/v2 v2.12.0
github.com/davecgh/go-spew v1.1.1
github.com/evanphx/json-patch v5.9.0+incompatible
github.com/go-logr/logr v1.4.1
github.com/golang-jwt/jwt/v4 v4.5.0
github.com/golang-jwt/jwt/v4 v4.5.1
github.com/google/go-cmp v0.6.0
github.com/google/go-github/v52 v52.0.0
github.com/google/uuid v1.6.0
Expand Down Expand Up @@ -60,7 +60,7 @@ require (
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/google/gnostic-models v0.6.9-0.20230804172637-c7be7c783f49 // indirect
github.com/google/go-github/v56 v56.0.0 // indirect
github.com/google/go-github/v66 v66.0.0 // indirect
github.com/google/go-querystring v1.1.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20231101202521-4ca4178f5c7a // indirect
Expand Down
Loading

0 comments on commit 8c32cd2

Please sign in to comment.