From c92b29bb5f45321b60ed626f241096f027adca45 Mon Sep 17 00:00:00 2001
From: David Moreau Simard <moi@dmsimard.com>
Date: Thu, 20 Jul 2023 20:32:53 -0400
Subject: [PATCH] WIP: prometheus_exporter iteration 10

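- Make backfill of data optional: it is now enabled with --backfill (and
  bounded by --max-days). By default, only data updated within the last
  poll interval (--poll-frequency, whose default drops from 60 to 30
  seconds) is queried.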

- Use "updated" instead of "created" when searching for metrics.
  The "created" timestamp never changes, so if a playbook was first seen
  as "running", for example, prometheus might never see it as
  "completed" because it had long since fallen out of our search window.
  By using "updated", the playbooks, hosts and tasks come back into the
  search window once their statuses are eventually updated, so they are
  picked up again after completion.

- Add a Grafana panel showing basic metrics for the prometheus exporter
  itself.
---
 ara/cli/prometheus.py              |  56 +++++++-----
 contrib/grafana/ara-dashboard.json | 135 ++++++++++++++++++++++++++++-
 2 files changed, 166 insertions(+), 25 deletions(-)

diff --git a/ara/cli/prometheus.py b/ara/cli/prometheus.py
index 8326b62f..d3cb1a61 100644
--- a/ara/cli/prometheus.py
+++ b/ara/cli/prometheus.py
@@ -40,15 +40,15 @@
 
 
 # TODO: This could be made more flexible and live in a library
-def get_search_results(client, kind, limit, created_after):
+def get_search_results(client, kind, limit, updated_after):
     """
     kind: string, one of ["playbooks", "hosts", "tasks"]
     limit: int, the number of items to return per page
-    created_after: string, a date formatted as such: 2020-01-31T15:45:36.737000Z
+    updated_after: string, a date formatted as such: 2020-01-31T15:45:36.737000Z
     """
     query = f"/api/v1/{kind}?order=-id&limit={limit}"
-    if created_after is not None:
-        query += f"&created_after={created_after}"
+    if updated_after is not None:
+        query += f"&updated_after={updated_after}"
 
     response = client.get(query)
     items = response["results"]
@@ -80,11 +80,11 @@ def __init__(self, client, log, limit, labels=DEFAULT_PLAYBOOK_LABELS):
         }
         self.metrics["range"].set(self.limit)
 
-    def collect_metrics(self, created_after=None):
-        playbooks = get_search_results(self.client, "playbooks", self.limit, created_after)
+    def collect_metrics(self, updated_after=None):
+        playbooks = get_search_results(self.client, "playbooks", self.limit, updated_after)
         # Save the most recent timestamp so we only scrape beyond it next time
         if playbooks:
-            created_after = cli_utils.increment_timestamp(playbooks[0]["created"])
+            updated_after = cli_utils.increment_timestamp(playbooks[0]["updated"])
             self.log.info(f"updating metrics for {len(playbooks)} playbooks...")
 
         for playbook in playbooks:
@@ -106,7 +106,7 @@ def collect_metrics(self, created_after=None):
             self.metrics["playbooks"].labels(**labels).observe(seconds)
             self.metrics["total"].inc()
 
-        return created_after
+        return updated_after
 
 
 class AraTaskCollector(object):
@@ -123,11 +123,11 @@ def __init__(self, client, log, limit, labels=DEFAULT_TASK_LABELS):
         }
         self.metrics["range"].set(self.limit)
 
-    def collect_metrics(self, created_after=None):
-        tasks = get_search_results(self.client, "tasks", self.limit, created_after)
+    def collect_metrics(self, updated_after=None):
+        tasks = get_search_results(self.client, "tasks", self.limit, updated_after)
         # Save the most recent timestamp so we only scrape beyond it next time
         if tasks:
-            created_after = cli_utils.increment_timestamp(tasks[0]["created"])
+            updated_after = cli_utils.increment_timestamp(tasks[0]["updated"])
             self.log.info(f"updating metrics for {len(tasks)} tasks...")
 
         for task in tasks:
@@ -149,7 +149,7 @@ def collect_metrics(self, created_after=None):
             self.metrics["tasks"].labels(**labels).observe(seconds)
             self.metrics["total"].inc()
 
-        return created_after
+        return updated_after
 
 
 class AraHostCollector(object):
@@ -170,11 +170,11 @@ def __init__(self, client, log, limit, labels=DEFAULT_HOST_LABELS):
         }
         self.metrics["range"].set(self.limit)
 
-    def collect_metrics(self, created_after=None):
-        hosts = get_search_results(self.client, "hosts", self.limit, created_after)
+    def collect_metrics(self, updated_after=None):
+        hosts = get_search_results(self.client, "hosts", self.limit, updated_after)
         # Save the most recent timestamp so we only scrape beyond it next time
         if hosts:
-            created_after = cli_utils.increment_timestamp(hosts[0]["created"])
+            updated_after = cli_utils.increment_timestamp(hosts[0]["updated"])
             self.log.info(f"updating metrics for {len(hosts)} hosts...")
 
         for host in hosts:
@@ -189,7 +189,7 @@ def collect_metrics(self, created_after=None):
                 if host[status]:
                     self.metrics[status].labels(**labels).set(host[status])
 
-        return created_after
+        return updated_after
 
 
 class PrometheusExporter(Command):
@@ -221,8 +221,8 @@ def get_parser(self, prog_name):
         )
         parser.add_argument(
             '--poll-frequency',
-            help='Seconds to wait until querying ara for new metrics (default: 60)',
-            default=60,
+            help='Seconds to wait until querying ara for new metrics (default: 30)',
+            default=30,
             type=int
         )
         parser.add_argument(
@@ -231,6 +231,12 @@ def get_parser(self, prog_name):
             default=8001,
             type=int
         )
+        parser.add_argument(
+            '--backfill',
+            help='Enable backfill of playbook metrics from the past',
+            default=False,
+            action="store_true"
+        )
         parser.add_argument(
             '--max-days',
             help='Maximum number of days to backfill metrics for (default: 90)',
@@ -267,12 +273,16 @@ def take_action(self, args):
         start_http_server(args.prometheus_port)
         self.log.info(f"ara prometheus exporter listening on http://0.0.0.0:{args.prometheus_port}/metrics")
 
-        created_after = (datetime.now() - timedelta(days=args.max_days)).isoformat()
-        self.log.info(
-            f"Backfilling metrics for the last {args.max_days} days since {created_after}... This can take a while."
-        )
+        # Query ara for data updated since the last poll (or up to a number of days if backfilling is enabled)
+        if args.backfill:
+            updated_after = (datetime.now() - timedelta(days=args.max_days)).isoformat()
+            self.log.info(
+                f"Backfilling metrics for the last {args.max_days} days since {updated_after}... This can take a while."
+            )
+        else:
+            updated_after = (datetime.now() - timedelta(seconds=args.poll_frequency)).isoformat()
 
-        latest = defaultdict(lambda: created_after)
+        latest = defaultdict(lambda: updated_after)
         while True:
             latest["playbooks"] = playbooks.collect_metrics(latest["playbooks"])
             latest["hosts"] = hosts.collect_metrics(latest["hosts"])
diff --git a/contrib/grafana/ara-dashboard.json b/contrib/grafana/ara-dashboard.json
index 56f1bc1f..a0c9e4b5 100644
--- a/contrib/grafana/ara-dashboard.json
+++ b/contrib/grafana/ara-dashboard.json
@@ -60,6 +60,7 @@
   "liveNow": false,
   "panels": [
     {
+      "collapsed": false,
       "gridPos": {
         "h": 1,
         "w": 24,
@@ -67,6 +68,7 @@
         "y": 0
       },
       "id": 13,
+      "panels": [],
       "title": "Playbooks",
       "type": "row"
     },
@@ -1821,6 +1823,135 @@
       ],
       "title": "Host failed results by name",
       "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 111
+      },
+      "id": 21,
+      "panels": [],
+      "title": "Prometheus exporter",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 112
+      },
+      "id": 22,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "increase(ara_playbooks_total[$__rate_interval])",
+          "legendFormat": "Playbooks",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "increase(ara_tasks_total[$__rate_interval])",
+          "hide": false,
+          "legendFormat": "Tasks",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr": "increase(ara_hosts_total[$__rate_interval])",
+          "hide": false,
+          "legendFormat": "Hosts",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Ingested metrics",
+      "type": "timeseries"
     }
   ],
   "refresh": "",
@@ -1831,13 +1962,13 @@
     "list": []
   },
   "time": {
-    "from": "now-30m",
+    "from": "now-6h",
     "to": "now"
   },
   "timepicker": {},
   "timezone": "",
   "title": "Ansible metrics (by ara)",
   "uid": "e0717f1a-4bb5-4373-b177-a9f5a498962d",
-  "version": 4,
+  "version": 7,
   "weekStart": ""
 }
\ No newline at end of file