Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve metrics #3847

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
199 changes: 95 additions & 104 deletions cmd/ghalistener/metrics/metrics.go
Copy link

@atsu85 atsu85 Dec 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo in commit description?

They cause a creation of new services with each job execution.

vs

They cause a creation of new series with each job execution.

Also worth mentioning cardinality explosion and OOMs

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package metrics
import (
"context"
"net/http"
"strconv"
"time"

"github.com/actions/actions-runner-controller/github/actions"
Expand All @@ -19,11 +18,9 @@ const (
labelKeyOrganization = "organization"
labelKeyRepository = "repository"
labelKeyJobName = "job_name"
labelKeyJobWorkflowRef = "job_workflow_ref"
labelKeyEventName = "event_name"
labelKeyJobResult = "job_result"
labelKeyRunnerID = "runner_id"
labelKeyRunnerName = "runner_name"
labelKeyRunnerPodName = "pod_name"
)

const githubScaleSetSubsystem = "gha"
Expand All @@ -43,14 +40,15 @@ var (
labelKeyOrganization,
labelKeyEnterprise,
labelKeyJobName,
labelKeyJobWorkflowRef,
labelKeyEventName,
}

completedJobsTotalLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
jobExecutionDurationLabels = append(jobLabels, labelKeyJobResult, labelKeyRunnerID, labelKeyRunnerName)
startedJobsTotalLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
jobStartupDurationLabels = append(jobLabels, labelKeyRunnerID, labelKeyRunnerName)
completedJobsTotalLabels = append(jobLabels, labelKeyJobResult)
lastJobExecutionDurationLabels = append(jobLabels, labelKeyJobResult)
startedJobsTotalLabels = jobLabels
lastJobStartupDurationLabels = jobLabels
jobQueueDurationLabels = jobLabels
runnerLabels = append(jobLabels, labelKeyRunnerPodName)
)

var (
Expand Down Expand Up @@ -144,97 +142,69 @@ var (
completedJobsTotalLabels,
)

jobStartupDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
// Becasue jobs might not run with uniform frequency calculating rates from histogram might not be suitable for all jobs.
// With last durations we can use prometheus <aggr>_over_time functions to display the last duration of the job.
jobLastQueueDurationSeconds = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "job_startup_duration_seconds",
Help: "Time spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).",
Buckets: runtimeBuckets,
Name: "job_last_queue_duration_seconds",
Help: "Last duration spent in the queue by the job (in seconds).",
},
jobStartupDurationLabels,
jobQueueDurationLabels,
)

jobExecutionDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
jobLastStartupDurationSeconds = prometheus.NewGaugeVec(
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worth adding a comment (in addition to commit message) why Gague is used, while ideally Histogram seems better data type - might avoid a lot of WTFs and wasting time basically reverting this change ;)

prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "job_execution_duration_seconds",
Help: "Time spent executing workflow jobs by the scale set (in seconds).",
Buckets: runtimeBuckets,
Name: "job_last_startup_duration_seconds",
Help: "The last duration spent waiting for workflow job to get started on the runner owned by the scale set (in seconds).",
},
jobExecutionDurationLabels,
lastJobStartupDurationLabels,
)
)

var runtimeBuckets []float64 = []float64{
0.01,
0.05,
0.1,
0.5,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
12,
15,
18,
20,
25,
30,
40,
50,
60,
70,
80,
90,
100,
110,
120,
150,
180,
210,
240,
300,
360,
420,
480,
540,
600,
900,
1200,
1800,
2400,
3000,
3600,
}
jobLastExecutionDurationSeconds = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "job_last_execution_duration_seconds",
Help: "The last duration spent executing workflow jobs by the scale set (in seconds).",
},
lastJobExecutionDurationLabels,
)

runnerJob = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: githubScaleSetSubsystem,
Name: "runner_job",
Help: "Job information for the runner.",
},
runnerLabels,
)
)

type baseLabels struct {
scaleSetName string
scaleSetNamespace string
enterprise string
organization string
repository string
scaleSetName string
scaleSetNamespace string
runnerScaleSetName string
enterprise string
organization string
repository string
}

func (b *baseLabels) jobLabels(jobBase *actions.JobMessageBase) prometheus.Labels {
return prometheus.Labels{
labelKeyEnterprise: b.enterprise,
labelKeyOrganization: jobBase.OwnerName,
labelKeyRepository: jobBase.RepositoryName,
labelKeyJobName: jobBase.JobDisplayName,
labelKeyJobWorkflowRef: jobBase.JobWorkflowRef,
labelKeyEventName: jobBase.EventName,
labelKeyRunnerScaleSetName: b.runnerScaleSetName,
labelKeyRunnerScaleSetNamespace: b.scaleSetNamespace,
labelKeyEnterprise: b.enterprise,
labelKeyOrganization: jobBase.OwnerName,
labelKeyRepository: jobBase.RepositoryName,
labelKeyJobName: jobBase.JobDisplayName,
labelKeyEventName: jobBase.EventName,
}
}

func (b *baseLabels) scaleSetLabels() prometheus.Labels {
return prometheus.Labels{
labelKeyRunnerScaleSetName: b.scaleSetName,
labelKeyRunnerScaleSetName: b.runnerScaleSetName,
labelKeyRunnerScaleSetNamespace: b.scaleSetNamespace,
labelKeyEnterprise: b.enterprise,
labelKeyOrganization: b.organization,
Expand All @@ -244,16 +214,18 @@ func (b *baseLabels) scaleSetLabels() prometheus.Labels {

func (b *baseLabels) completedJobLabels(msg *actions.JobCompleted) prometheus.Labels {
l := b.jobLabels(&msg.JobMessageBase)
l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId)
l[labelKeyJobResult] = msg.Result
l[labelKeyRunnerName] = msg.RunnerName
return l
}

func (b *baseLabels) startedJobLabels(msg *actions.JobStarted) prometheus.Labels {
l := b.jobLabels(&msg.JobMessageBase)
l[labelKeyRunnerID] = strconv.Itoa(msg.RunnerId)
l[labelKeyRunnerName] = msg.RunnerName
return l
}

func (b *baseLabels) runnerLabels(msg *actions.JobMessageBase, runnerName string) prometheus.Labels {
l := b.jobLabels(msg)
l[labelKeyRunnerPodName] = runnerName
return l
}

Expand Down Expand Up @@ -286,14 +258,15 @@ type exporter struct {
}

type ExporterConfig struct {
ScaleSetName string
ScaleSetNamespace string
Enterprise string
Organization string
Repository string
ServerAddr string
ServerEndpoint string
Logger logr.Logger
ScaleSetName string
ScaleSetNamespace string
RunnerScaleSetName string
Enterprise string
Organization string
Repository string
ServerAddr string
ServerEndpoint string
Logger logr.Logger
}

func NewExporter(config ExporterConfig) ServerPublisher {
Expand All @@ -309,8 +282,10 @@ func NewExporter(config ExporterConfig) ServerPublisher {
idleRunners,
startedJobsTotal,
completedJobsTotal,
jobStartupDurationSeconds,
jobExecutionDurationSeconds,
jobLastQueueDurationSeconds,
jobLastStartupDurationSeconds,
jobLastExecutionDurationSeconds,
runnerJob,
)

mux := http.NewServeMux()
Expand All @@ -322,11 +297,12 @@ func NewExporter(config ExporterConfig) ServerPublisher {
return &exporter{
logger: config.Logger.WithName("metrics"),
baseLabels: baseLabels{
scaleSetName: config.ScaleSetName,
scaleSetNamespace: config.ScaleSetNamespace,
enterprise: config.Enterprise,
organization: config.Organization,
repository: config.Repository,
scaleSetName: config.ScaleSetName,
scaleSetNamespace: config.ScaleSetNamespace,
runnerScaleSetName: config.RunnerScaleSetName,
enterprise: config.Enterprise,
organization: config.Organization,
repository: config.Repository,
},
srv: &http.Server{
Addr: config.ServerAddr,
Expand Down Expand Up @@ -367,16 +343,31 @@ func (e *exporter) PublishJobStarted(msg *actions.JobStarted) {
l := e.startedJobLabels(msg)
startedJobsTotal.With(l).Inc()

startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix()
jobStartupDurationSeconds.With(l).Observe(float64(startupDuration))
if !msg.JobMessageBase.RunnerAssignTime.IsZero() && !msg.JobMessageBase.ScaleSetAssignTime.IsZero() {
startupDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.ScaleSetAssignTime.Unix()
jobLastStartupDurationSeconds.With(l).Set(float64(startupDuration))
}

if !msg.JobMessageBase.QueueTime.IsZero() && !msg.JobMessageBase.RunnerAssignTime.IsZero() {
queueDuration := msg.JobMessageBase.RunnerAssignTime.Unix() - msg.JobMessageBase.QueueTime.Unix()
jobLastQueueDurationSeconds.With(l).Set(float64(queueDuration))
}

rl := e.runnerLabels(&msg.JobMessageBase, msg.RunnerName)
runnerJob.With(rl).Set(1)
}

func (e *exporter) PublishJobCompleted(msg *actions.JobCompleted) {
l := e.completedJobLabels(msg)
completedJobsTotal.With(l).Inc()

executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix()
jobExecutionDurationSeconds.With(l).Observe(float64(executionDuration))
if !msg.JobMessageBase.FinishTime.IsZero() && !msg.JobMessageBase.RunnerAssignTime.IsZero() {
executionDuration := msg.JobMessageBase.FinishTime.Unix() - msg.JobMessageBase.RunnerAssignTime.Unix()
jobLastExecutionDurationSeconds.With(l).Set(float64(executionDuration))
}

rl := e.runnerLabels(&msg.JobMessageBase, msg.RunnerName)
runnerJob.Delete(rl)
}

func (m *exporter) PublishDesiredRunners(count int) {
Expand Down
33 changes: 17 additions & 16 deletions cmd/githubrunnerscalesetlistener/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,23 @@ import (
)

type Config struct {
ConfigureUrl string `json:"configureUrl"`
AppID int64 `json:"appID"`
AppInstallationID int64 `json:"appInstallationID"`
AppPrivateKey string `json:"appPrivateKey"`
Token string `json:"token"`
EphemeralRunnerSetNamespace string `json:"ephemeralRunnerSetNamespace"`
EphemeralRunnerSetName string `json:"ephemeralRunnerSetName"`
MaxRunners int `json:"maxRunners"`
MinRunners int `json:"minRunners"`
RunnerScaleSetId int `json:"runnerScaleSetId"`
RunnerScaleSetName string `json:"runnerScaleSetName"`
ServerRootCA string `json:"serverRootCA"`
LogLevel string `json:"logLevel"`
LogFormat string `json:"logFormat"`
MetricsAddr string `json:"metricsAddr"`
MetricsEndpoint string `json:"metricsEndpoint"`
ConfigureUrl string `json:"configureUrl"`
AppID int64 `json:"appID"`
AppInstallationID int64 `json:"appInstallationID"`
AppPrivateKey string `json:"appPrivateKey"`
Token string `json:"token"`
EphemeralRunnerSetNamespace string `json:"ephemeralRunnerSetNamespace"`
EphemeralRunnerSetName string `json:"ephemeralRunnerSetName"`
MaxRunners int `json:"maxRunners"`
MinRunners int `json:"minRunners"`
RunnerScaleSetId int `json:"runnerScaleSetId"`
RunnerScaleSetName string `json:"runnerScaleSetName"`
AutoscalingRunnerScaleSetName string `json:"autoscalingRunnerScaleSetName"`
ServerRootCA string `json:"serverRootCA"`
LogLevel string `json:"logLevel"`
LogFormat string `json:"logFormat"`
MetricsAddr string `json:"metricsAddr"`
MetricsEndpoint string `json:"metricsEndpoint"`
}

func Read(path string) (Config, error) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ func (r *AutoscalingListenerReconciler) createListenerPod(ctx context.Context, a

logger.Info("Creating listener config secret")

podConfig, err := r.ResourceBuilder.newScaleSetListenerConfig(autoscalingListener, secret, metricsConfig, cert)
podConfig, err := r.ResourceBuilder.newScaleSetListenerConfig(autoscalingListener, secret, metricsConfig, cert, autoscalingRunnerSet.Spec.RunnerScaleSetName)
if err != nil {
logger.Error(err, "Failed to build listener config secret")
return ctrl.Result{}, err
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ func (r *EphemeralRunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.R

metrics.SetEphemeralRunnerCountsByStatus(
metrics.CommonLabels{
Name: ephemeralRunnerSet.Labels[LabelKeyGitHubScaleSetName],
Name: ephemeralRunnerSet.Annotations[AnnotationKeyGitHubRunnerScaleSetName],
Namespace: ephemeralRunnerSet.Labels[LabelKeyGitHubScaleSetNamespace],
Repository: parsedURL.Repository,
Organization: parsedURL.Organization,
Expand Down
Loading