WIP: prometheus_exporter iteration 10
- Make backfilling data (behind a new --backfill flag, together with
  --max-days) optional, defaulting to only querying for data updated
  within the last poll interval.

- Use "updated" instead of "created" when searching for metrics.
  Using "created" had the issue of never being updated and so if a
playbook was seen "running", for example, it may never be seen by
prometheus as "completed" because our window of search had long gone.
  By using "updated", the playbook, host and tasks' statuses will
eventually be updated so they will come back around to be picked up
after completion.

- Add a grafana panel showing basic metrics for the prometheus
  exporter itself.
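
The net effect on the polling loop, as a minimal standalone sketch. The poll function and its arguments are illustrative, not the actual implementation (which lives in ara/cli/prometheus.py below and advances the timestamp via cli_utils.increment_timestamp); "client" is assumed to be the ara API client, which exposes get() as used by get_search_results in the diff:

    # Minimal sketch of the new polling behavior, not the actual implementation.
    import time
    from datetime import datetime, timedelta

    def poll(client, backfill=False, max_days=90, poll_frequency=30):
        # Start the search window at the backfill horizon if requested,
        # otherwise just one poll interval in the past.
        if backfill:
            updated_after = (datetime.now() - timedelta(days=max_days)).isoformat()
        else:
            updated_after = (datetime.now() - timedelta(seconds=poll_frequency)).isoformat()

        while True:
            # Filtering on "updated" (rather than "created") means records whose
            # status changed since the last pass are returned again, so a playbook
            # first seen as "running" is eventually re-read as "completed".
            response = client.get(f"/api/v1/playbooks?order=-id&updated_after={updated_after}")
            if response["results"]:
                # Advance the watermark to the newest "updated" timestamp so the
                # next pass only scrapes beyond it.
                updated_after = response["results"][0]["updated"]
            time.sleep(poll_frequency)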
dmsimard committed Jul 21, 2023
1 parent 6283872 commit c92b29b
Showing 2 changed files with 166 additions and 25 deletions.
56 changes: 33 additions & 23 deletions ara/cli/prometheus.py
@@ -40,15 +40,15 @@


 # TODO: This could be made more flexible and live in a library
-def get_search_results(client, kind, limit, created_after):
+def get_search_results(client, kind, limit, updated_after):
     """
     kind: string, one of ["playbooks", "hosts", "tasks"]
     limit: int, the number of items to return per page
-    created_after: string, a date formatted as such: 2020-01-31T15:45:36.737000Z
+    updated_after: string, a date formatted as such: 2020-01-31T15:45:36.737000Z
     """
     query = f"/api/v1/{kind}?order=-id&limit={limit}"
-    if created_after is not None:
-        query += f"&created_after={created_after}"
+    if updated_after is not None:
+        query += f"&updated_after={updated_after}"

     response = client.get(query)
     items = response["results"]
@@ -80,11 +80,11 @@ def __init__(self, client, log, limit, labels=DEFAULT_PLAYBOOK_LABELS):
         }
         self.metrics["range"].set(self.limit)

-    def collect_metrics(self, created_after=None):
-        playbooks = get_search_results(self.client, "playbooks", self.limit, created_after)
+    def collect_metrics(self, updated_after=None):
+        playbooks = get_search_results(self.client, "playbooks", self.limit, updated_after)
         # Save the most recent timestamp so we only scrape beyond it next time
         if playbooks:
-            created_after = cli_utils.increment_timestamp(playbooks[0]["created"])
+            updated_after = cli_utils.increment_timestamp(playbooks[0]["updated"])
         self.log.info(f"updating metrics for {len(playbooks)} playbooks...")

         for playbook in playbooks:
@@ -106,7 +106,7 @@ def collect_metrics(self, created_after=None):
             self.metrics["playbooks"].labels(**labels).observe(seconds)
             self.metrics["total"].inc()

-        return created_after
+        return updated_after


 class AraTaskCollector(object):
@@ -123,11 +123,11 @@ def __init__(self, client, log, limit, labels=DEFAULT_TASK_LABELS):
         }
         self.metrics["range"].set(self.limit)

-    def collect_metrics(self, created_after=None):
-        tasks = get_search_results(self.client, "tasks", self.limit, created_after)
+    def collect_metrics(self, updated_after=None):
+        tasks = get_search_results(self.client, "tasks", self.limit, updated_after)
         # Save the most recent timestamp so we only scrape beyond it next time
         if tasks:
-            created_after = cli_utils.increment_timestamp(tasks[0]["created"])
+            updated_after = cli_utils.increment_timestamp(tasks[0]["updated"])
         self.log.info(f"updating metrics for {len(tasks)} tasks...")

         for task in tasks:
@@ -149,7 +149,7 @@
             self.metrics["tasks"].labels(**labels).observe(seconds)
             self.metrics["total"].inc()

-        return created_after
+        return updated_after


 class AraHostCollector(object):
@@ -170,11 +170,11 @@ def __init__(self, client, log, limit, labels=DEFAULT_HOST_LABELS):
         }
         self.metrics["range"].set(self.limit)

-    def collect_metrics(self, created_after=None):
-        hosts = get_search_results(self.client, "hosts", self.limit, created_after)
+    def collect_metrics(self, updated_after=None):
+        hosts = get_search_results(self.client, "hosts", self.limit, updated_after)
         # Save the most recent timestamp so we only scrape beyond it next time
         if hosts:
-            created_after = cli_utils.increment_timestamp(hosts[0]["created"])
+            updated_after = cli_utils.increment_timestamp(hosts[0]["updated"])
         self.log.info(f"updating metrics for {len(hosts)} hosts...")

         for host in hosts:
@@ -189,7 +189,7 @@
                 if host[status]:
                     self.metrics[status].labels(**labels).set(host[status])

-        return created_after
+        return updated_after


 class PrometheusExporter(Command):
@@ -221,8 +221,8 @@ def get_parser(self, prog_name):
         )
         parser.add_argument(
             '--poll-frequency',
-            help='Seconds to wait until querying ara for new metrics (default: 60)',
-            default=60,
+            help='Seconds to wait until querying ara for new metrics (default: 30)',
+            default=30,
             type=int
         )
         parser.add_argument(
@@ -231,6 +231,12 @@
             default=8001,
             type=int
         )
+        parser.add_argument(
+            '--backfill',
+            help='Enable backfill of playbook metrics from the past',
+            default=False,
+            action="store_true"
+        )
         parser.add_argument(
             '--max-days',
             help='Maximum number of days to backfill metrics for (default: 90)',
@@ -267,12 +273,16 @@ def take_action(self, args):
         start_http_server(args.prometheus_port)
         self.log.info(f"ara prometheus exporter listening on http://0.0.0.0:{args.prometheus_port}/metrics")

-        created_after = (datetime.now() - timedelta(days=args.max_days)).isoformat()
-        self.log.info(
-            f"Backfilling metrics for the last {args.max_days} days since {created_after}... This can take a while."
-        )
+        # Query ara for data updated since the last poll (or up to a number of days if backfilling is enabled)
+        if args.backfill:
+            updated_after = (datetime.now() - timedelta(days=args.max_days)).isoformat()
+            self.log.info(
+                f"Backfilling metrics for the last {args.max_days} days since {updated_after}... This can take a while."
+            )
+        else:
+            updated_after = (datetime.now() - timedelta(seconds=args.poll_frequency)).isoformat()

-        latest = defaultdict(lambda: created_after)
+        latest = defaultdict(lambda: updated_after)
         while True:
             latest["playbooks"] = playbooks.collect_metrics(latest["playbooks"])
             latest["hosts"] = hosts.collect_metrics(latest["hosts"])
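As a usage note, once the exporter is running it can be sanity-checked by scraping its endpoint directly. A small sketch, assuming an exporter on localhost with the default port 8001 from the diff above:

    # Fetch the exporter's /metrics endpoint and print the ara counters that
    # the new "Ingested metrics" dashboard panel graphs. The localhost URL and
    # default port 8001 are assumptions; adjust for your deployment.
    from urllib.request import urlopen

    with urlopen("http://localhost:8001/metrics") as response:
        for line in response.read().decode().splitlines():
            if line.startswith(("ara_playbooks_total", "ara_tasks_total", "ara_hosts_total")):
                print(line)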
135 changes: 133 additions & 2 deletions contrib/grafana/ara-dashboard.json
@@ -60,13 +60,15 @@
   "liveNow": false,
   "panels": [
     {
+      "collapsed": false,
       "gridPos": {
         "h": 1,
         "w": 24,
         "x": 0,
         "y": 0
       },
       "id": 13,
+      "panels": [],
       "title": "Playbooks",
       "type": "row"
     },
@@ -1821,6 +1823,135 @@
       ],
       "title": "Host failed results by name",
       "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 111
+      },
+      "id": 21,
+      "panels": [],
+      "title": "Prometheus exporter",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 112
+      },
+      "id": 22,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "increase(ara_playbooks_total[$__rate_interval])",
+          "legendFormat": "Playbooks",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "increase(ara_tasks_total[$__rate_interval])",
+          "hide": false,
+          "legendFormat": "Tasks",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "increase(ara_hosts_total[$__rate_interval])",
+          "hide": false,
+          "legendFormat": "Hosts",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Ingested metrics",
+      "type": "timeseries"
     }
   ],
   "refresh": "",
@@ -1831,13 +1962,13 @@
     "list": []
   },
   "time": {
-    "from": "now-30m",
+    "from": "now-6h",
    "to": "now"
   },
   "timepicker": {},
   "timezone": "",
   "title": "Ansible metrics (by ara)",
   "uid": "e0717f1a-4bb5-4373-b177-a9f5a498962d",
-  "version": 4,
+  "version": 7,
   "weekStart": ""
 }
