local g = import 'grafana-builder/grafana.libsonnet';
local grafana = import 'grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;
local graphPanel = grafana.graphPanel;
local tablePanel = grafana.tablePanel;
local template = grafana.template;

{
  grafanaDashboards+:: {
    // 'Prometheus Overview' dashboard, built with the grafana-builder helpers
    // imported as `g`. It registers `job` and `instance` templates and every
    // panel query filters on them via job=~"$job" / instance=~"$instance".
    'prometheus.json':
      g.dashboard('Prometheus Overview')
      .addMultiTemplate('job', 'prometheus_build_info', 'job')
      .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
      // Row 1: table of build info and process uptime per job/instance.
      .addRow(
        g.row('Prometheus Stats')
        .addPanel(
          g.panel('Prometheus Stats') +
          g.tablePanel([
            'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
            'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
          ], {
            job: { alias: 'Job' },
            instance: { alias: 'Instance' },
            version: { alias: 'Version' },
            // The first query only contributes the `version` label; its value
            // column is hidden.
            'Value #A': { alias: 'Count', type: 'hidden' },
            'Value #B': { alias: 'Uptime' },
          })
        )
      )
      // Row 2: service discovery.
      .addRow(
        g.row('Discovery')
        .addPanel(
          g.panel('Target Sync') +
          // `* 1e3` scales the seconds-based metric for the 'ms' y-axis.
          g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Targets') +
          g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') +
          g.stack
        )
      )
      // Row 3: scraping.
      .addRow(
        g.row('Retrieval')
        .addPanel(
          g.panel('Average Scrape Interval Duration') +
          // sum / count gives the average; `* 1e3` scales it for the 'ms' y-axis.
          g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Scrape failures') +
          // Filtered by $job/$instance like every other panel on this
          // dashboard, so the template selection applies here too.
          g.queryPanel([
            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{job=~"$job",instance=~"$instance"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~"$job",instance=~"$instance"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{job=~"$job",instance=~"$instance"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total{job=~"$job",instance=~"$instance"}[1m]))',
          ], [
            'exceeded sample limit: {{job}}',
            'duplicate timestamp: {{job}}',
            'out of bounds: {{job}}',
            'out of order: {{job}}',
          ]) +
          g.stack
        )
        .addPanel(
          g.panel('Appended Samples') +
          g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') +
          g.stack
        )
      )
      // Row 4: TSDB head.
      .addRow(
        g.row('Storage')
        .addPanel(
          g.panel('Head Series') +
          g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
          g.stack
        )
        .addPanel(
          g.panel('Head Chunks') +
          g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
          g.stack
        )
      )
      // Row 5: query engine.
      .addRow(
        g.row('Query')
        .addPanel(
          g.panel('Query Rate') +
          g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') +
          g.stack,
        )
        .addPanel(
          g.panel('Stage Duration') +
          // `* 1e3` scales the seconds-based metric for the 'ms' y-axis.
          g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
          { yaxes: g.yaxes('ms') } +
          g.stack,
        )
      ),

    // Remote-write-specific dashboard.
    'prometheus-remote-write.json':
      // Graph panel for
      //   prometheus_remote_storage_highest_timestamp_in_seconds
      //   - prometheus_remote_storage_queue_highest_sent_timestamp_seconds
      // matched with ignoring(remote_name, url) group_right(instance).
      // Whitespace inside the text block is part of the query string and is
      // kept exactly as it was.
      local timestampComparison =
        local lagExpr = |||
          (
            prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} 
          -  
            ignoring(remote_name, url) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}
          )
        |||;
        local lagTarget = prometheus.target(
          lagExpr,
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
        );
        graphPanel.new(
          'Highest Timestamp In vs. Highest Timestamp Sent',
          datasource='$datasource',
          span=6,
        ).addTarget(lagTarget);

      // Graph panel for the 5m rate() of the highest-timestamp-in metric minus
      // the 5m rate() of the queue highest-sent-timestamp metric, matched with
      // ignoring (remote_name, url) group_right(instance).
      // Whitespace inside the text block is part of the query string and is
      // kept exactly as it was.
      local timestampComparisonRate =
        local rateExpr = |||
          (
            rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])  
          - 
            ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
          )
        |||;
        local rateTarget = prometheus.target(
          rateExpr,
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
        );
        graphPanel.new(
          'Rate[5m]',
          datasource='$datasource',
          span=6,
        ).addTarget(rateTarget);

      // Full-width graph panel for: rate(samples_in_total) minus
      // rate(succeeded_samples_total) minus rate(dropped_samples_total), each
      // over 5m. Whitespace inside the text block is part of the query string
      // and is kept exactly as it was.
      local samplesRate =
        local backlogExpr = |||
          rate(
            prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])
          - 
            ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])
          - 
            rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])
        |||;
        local backlogTarget = prometheus.target(
          backlogExpr,
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        );
        graphPanel.new(
          'Rate, in vs. succeeded or dropped [5m]',
          datasource='$datasource',
          span=12,
        ).addTarget(backlogTarget);

      // Graph panel showing prometheus_remote_storage_shards for the selected
      // cluster/instance; the only shard panel that also sets min_span.
      local currentShards =
        local shardsTarget = prometheus.target(
          'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        );
        graphPanel.new(
          'Current Shards',
          datasource='$datasource',
          span=12,
          min_span=6,
        ).addTarget(shardsTarget);

      // Graph panel showing prometheus_remote_storage_shards_max.
      local maxShards =
        local expr = 'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance"}';
        local legend = '{{cluster}}:{{instance}} {{remote_name}}:{{url}}';
        graphPanel.new(
          'Max Shards',
          datasource='$datasource',
          span=4,
        ).addTarget(prometheus.target(expr, legendFormat=legend));

      // Graph panel showing prometheus_remote_storage_shards_min.
      local minShards =
        local expr = 'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance"}';
        local legend = '{{cluster}}:{{instance}} {{remote_name}}:{{url}}';
        graphPanel.new(
          'Min Shards',
          datasource='$datasource',
          span=4,
        ).addTarget(prometheus.target(expr, legendFormat=legend));

      // Graph panel showing prometheus_remote_storage_shards_desired.
      local desiredShards =
        local expr = 'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance"}';
        local legend = '{{cluster}}:{{instance}} {{remote_name}}:{{url}}';
        graphPanel.new(
          'Desired Shards',
          datasource='$datasource',
          span=4,
        ).addTarget(prometheus.target(expr, legendFormat=legend));

      // Half-width graph panel showing prometheus_remote_storage_shard_capacity.
      local shardsCapacity =
        local capacityTarget = prometheus.target(
          'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        );
        graphPanel.new(
          'Shard Capacity',
          datasource='$datasource',
          span=6,
        ).addTarget(capacityTarget);

      // Half-width graph panel showing prometheus_remote_storage_pending_samples.
      local pendingSamples =
        local pendingTarget = prometheus.target(
          'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        );
        graphPanel.new(
          'Pending Samples',
          datasource='$datasource',
          span=6,
        ).addTarget(pendingTarget);

      // Graph panel showing prometheus_tsdb_wal_segment_current; the Y axis
      // uses format 'none' and the legend carries only cluster and instance.
      local walSegment =
        local segmentTarget = prometheus.target(
          'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}}'
        );
        graphPanel.new(
          'TSDB Current Segment',
          datasource='$datasource',
          span=6,
          formatY1='none',
        ).addTarget(segmentTarget);

      // Graph panel showing prometheus_wal_watcher_current_segment; the Y axis
      // uses format 'none' and the legend adds the `consumer` label.
      local queueSegment =
        local watcherTarget = prometheus.target(
          'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{consumer}}'
        );
        graphPanel.new(
          'Remote Write Current Segment',
          datasource='$datasource',
          span=6,
          formatY1='none',
        ).addTarget(watcherTarget);

      // Graph panel showing the 5m rate of
      // prometheus_remote_storage_dropped_samples_total.
      local droppedSamples =
        local expr = 'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])';
        local legend = '{{cluster}}:{{instance}} {{remote_name}}:{{url}}';
        graphPanel.new(
          'Dropped Samples',
          datasource='$datasource',
          span=3,
        ).addTarget(prometheus.target(expr, legendFormat=legend));

      // Graph panel showing the 5m rate of
      // prometheus_remote_storage_failed_samples_total.
      local failedSamples =
        local expr = 'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])';
        local legend = '{{cluster}}:{{instance}} {{remote_name}}:{{url}}';
        graphPanel.new(
          'Failed Samples',
          datasource='$datasource',
          span=3,
        ).addTarget(prometheus.target(expr, legendFormat=legend));

      // Graph panel showing the 5m rate of
      // prometheus_remote_storage_retried_samples_total.
      local retriedSamples =
        local expr = 'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])';
        local legend = '{{cluster}}:{{instance}} {{remote_name}}:{{url}}';
        graphPanel.new(
          'Retried Samples',
          datasource='$datasource',
          span=3,
        ).addTarget(prometheus.target(expr, legendFormat=legend));

      // Graph panel showing the 5m rate of
      // prometheus_remote_storage_enqueue_retries_total.
      local enqueueRetries =
        local expr = 'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])';
        local legend = '{{cluster}}:{{instance}} {{remote_name}}:{{url}}';
        graphPanel.new(
          'Enqueue Retries',
          datasource='$datasource',
          span=3,
        ).addTarget(prometheus.target(expr, legendFormat=legend));

      // Assembles the 'Prometheus Remote Write' dashboard: a datasource
      // template, the `instance`, `cluster` and `url` query templates, and six
      // rows holding the panel locals declared earlier in this field.

      // Raw template object for the datasource variable referenced as
      // '$datasource' by the panels and query templates.
      local datasourceTemplate = {
        hide: 0,
        label: null,
        name: 'datasource',
        options: [],
        query: 'prometheus',
        refresh: 1,
        regex: '',
        type: 'datasource',
      };

      // `current` value shared by the `instance` and `cluster` templates.
      local allSelected = {
        selected: true,
        text: 'All',
        value: '$__all',
      };

      // NOTE(review): none of the query strings below contains a format
      // placeholder, so `% $._config` looks like a no-op apart from requiring
      // `$._config` to exist — confirm whether a config-driven selector was
      // intended.
      local instanceTemplate = template.new(
        'instance',
        '$datasource',
        'label_values(prometheus_build_info, instance)' % $._config,
        refresh='time',
        current=allSelected,
        includeAll=true,
      );

      local clusterTemplate = template.new(
        'cluster',
        '$datasource',
        'label_values(kube_pod_container_info{image=~".*prometheus.*"}, cluster)' % $._config,
        refresh='time',
        current=allSelected,
        includeAll=true,
      );

      // Unlike the two templates above, `url` is created without `current`.
      local urlTemplate = template.new(
        'url',
        '$datasource',
        'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}, url)' % $._config,
        refresh='time',
        includeAll=true,
      );

      dashboard.new('Prometheus Remote Write', editable=true)
      .addTemplate(datasourceTemplate)
      .addTemplate(instanceTemplate)
      .addTemplate(clusterTemplate)
      .addTemplate(urlTemplate)
      .addRow(
        row.new('Timestamps')
        .addPanel(timestampComparison)
        .addPanel(timestampComparisonRate)
      )
      .addRow(
        row.new('Samples')
        .addPanel(samplesRate)
      )
      .addRow(
        row.new('Shards')
        .addPanel(currentShards)
        .addPanel(maxShards)
        .addPanel(minShards)
        .addPanel(desiredShards)
      )
      .addRow(
        row.new('Shard Details')
        .addPanel(shardsCapacity)
        .addPanel(pendingSamples)
      )
      .addRow(
        row.new('Segments')
        .addPanel(walSegment)
        .addPanel(queueSegment)
      )
      .addRow(
        row.new('Misc. Rates')
        .addPanel(droppedSamples)
        .addPanel(failedSamples)
        .addPanel(retriedSamples)
        .addPanel(enqueueRetries)
      ),
  },
}