repos / ops

infra for pico services
git clone https://github.com/picosh/ops.git

commit
a1c9db1
parent
20da7ed
author
Antonio Mika
date
2023-02-24 21:29:56 +0000 UTC
Added alerting methods to provisioned resources
8 files changed,  +510, -2
M monitoring/.env.example
+2, -1
1@@ -1,3 +1,4 @@
2 APP_DOMAIN=dev.pico.sh
3 APP_EMAIL=hello@pico.sh
4-CF_API_TOKEN=secret
5+CF_API_TOKEN=secret
6+NOTIFY_WEBHOOK=secret
M monitoring/grafana/config/grafana.ini
+0, -1
1@@ -9,4 +9,3 @@ mode = console
2 admin_user = antonio
3 [server]
4 root_url = https://grafana.pico.sh/
5-
A monitoring/grafana/config/provisioning/alerting/contactpoints.yml
+12, -0
 1@@ -0,0 +1,12 @@
 2+apiVersion: 1
 3+
 4+contactPoints:
 5+  - orgId: 1
 6+    name: Notifico
 7+    receivers:
 8+      - uid: J9QprMbVz
 9+        type: webhook
10+        settings:
11+          url: $NOTIFY_WEBHOOK
12+          httpMethod: POST
13+          maxAlerts: '0'
A monitoring/grafana/config/provisioning/alerting/groups.yml
+193, -0
  1@@ -0,0 +1,193 @@
  2+apiVersion: 1
  3+
  4+groups:
  5+  - orgId: 1
  6+    name: HTTPS
  7+    folder: Default
  8+    interval: 60s
  9+    rules:
 10+      - id: 1
 11+        uid: 7pOZR7x4z
 12+        orgID: 1
 13+        folderUID: wLZWgnbVz
 14+        ruleGroup: HTTPS
 15+        title: Certificate Expiration
 16+        condition: C
 17+        data:
 18+        - refId: A
 19+          queryType: ''
 20+          relativeTimeRange:
 21+            from: 600
 22+            to: 0
 23+          datasourceUid: PBFA97CFB590B2093
 24+          model:
 25+            editorMode: code
 26+            expr: floor((probe_ssl_earliest_cert_expiry - time()) / (3600 * 24))
 27+            hide: false
 28+            intervalMs: 1000
 29+            legendFormat: __auto
 30+            maxDataPoints: 43200
 31+            range: true
 32+            refId: A
 33+        - refId: B
 34+          queryType: ''
 35+          relativeTimeRange:
 36+            from: 600
 37+            to: 0
 38+          datasourceUid: "-100"
 39+          model:
 40+            conditions:
 41+            - evaluator:
 42+                params:
 43+                - 0
 44+                - 0
 45+                type: gt
 46+              operator:
 47+                type: and
 48+              query:
 49+                params: []
 50+              reducer:
 51+                params: []
 52+                type: avg
 53+              type: query
 54+            datasource:
 55+              name: Expression
 56+              type: __expr__
 57+              uid: __expr__
 58+            expression: A
 59+            intervalMs: 1000
 60+            maxDataPoints: 43200
 61+            reducer: last
 62+            refId: B
 63+            settings:
 64+              mode: dropNN
 65+            type: reduce
 66+        - refId: C
 67+          queryType: ''
 68+          relativeTimeRange:
 69+            from: 600
 70+            to: 0
 71+          datasourceUid: "-100"
 72+          model:
 73+            conditions:
 74+            - evaluator:
 75+                params:
 76+                - 30
 77+                - 0
 78+                type: lt
 79+              operator:
 80+                type: and
 81+              query:
 82+                params: []
 83+              reducer:
 84+                params: []
 85+                type: avg
 86+              type: query
 87+            datasource:
 88+              name: Expression
 89+              type: __expr__
 90+              uid: __expr__
 91+            expression: B
 92+            intervalMs: 1000
 93+            maxDataPoints: 43200
 94+            refId: C
 95+            type: threshold
 96+        updated: '2023-02-24T20:29:12Z'
 97+        noDataState: Alerting
 98+        execErrState: Alerting
 99+        for: 1m
100+        annotations:
101+          description: The certificate for {{ $labels.instance }} will expire in {{ $values.B
102+            }} days
103+          summary: The certificate for {{ $labels.instance }} will expire in {{ $values.B
104+            }} days
105+      - id: 3
106+        uid: XqLjDnx4k
107+        orgID: 1
108+        folderUID: wLZWgnbVz
109+        ruleGroup: HTTPS
110+        title: HTTPS Status
111+        condition: C
112+        data:
113+        - refId: A
114+          queryType: ''
115+          relativeTimeRange:
116+            from: 600
117+            to: 0
118+          datasourceUid: PBFA97CFB590B2093
119+          model:
120+            editorMode: code
121+            expr: probe_success{job=~"blackbox_exporter_https_.*"}
122+            hide: false
123+            intervalMs: 1000
124+            legendFormat: __auto
125+            maxDataPoints: 43200
126+            range: true
127+            refId: A
128+        - refId: B
129+          queryType: ''
130+          relativeTimeRange:
131+            from: 600
132+            to: 0
133+          datasourceUid: "-100"
134+          model:
135+            conditions:
136+            - evaluator:
137+                params: []
138+                type: gt
139+              operator:
140+                type: and
141+              query:
142+                params:
143+                - B
144+              reducer:
145+                params: []
146+                type: last
147+              type: query
148+            datasource:
149+              type: __expr__
150+              uid: "-100"
151+            expression: A
152+            hide: false
153+            intervalMs: 1000
154+            maxDataPoints: 43200
155+            reducer: last
156+            refId: B
157+            type: reduce
158+        - refId: C
159+          queryType: ''
160+          relativeTimeRange:
161+            from: 600
162+            to: 0
163+          datasourceUid: "-100"
164+          model:
165+            conditions:
166+            - evaluator:
167+                params:
168+                - 1
169+                type: lt
170+              operator:
171+                type: and
172+              query:
173+                params:
174+                - C
175+              reducer:
176+                params: []
177+                type: last
178+              type: query
179+            datasource:
180+              type: __expr__
181+              uid: "-100"
182+            expression: B
183+            hide: false
184+            intervalMs: 1000
185+            maxDataPoints: 43200
186+            refId: C
187+            type: threshold
188+        updated: '2023-02-24T20:29:12Z'
189+        noDataState: NoData
190+        execErrState: Error
191+        for: 1m
192+        annotations:
193+          description: HTTPS is inaccessible for {{ $labels.instance }}
194+          summary: HTTPS is inaccessible for {{ $labels.instance }}
A monitoring/grafana/config/provisioning/alerting/http.yml
+98, -0
 1@@ -0,0 +1,98 @@
 2+apiVersion: 1
 3+
 4+groups:
 5+  - orgId: 1
 6+    name: HTTP
 7+    folder: Default
 8+    interval: 60s
 9+    rules:
10+      - id: 5
11+        uid: LKZ1d7bVk
12+        orgID: 1
13+        folderUID: wLZWgnbVz
14+        ruleGroup: HTTP
15+        title: HTTP Redirect
16+        condition: C
17+        data:
18+        - refId: A
19+          queryType: ''
20+          relativeTimeRange:
21+            from: 600
22+            to: 0
23+          datasourceUid: PBFA97CFB590B2093
24+          model:
25+            editorMode: code
26+            expr: probe_success{job="blackbox_exporter_http_3xx"}
27+            hide: false
28+            intervalMs: 1000
29+            legendFormat: __auto
30+            maxDataPoints: 43200
31+            range: true
32+            refId: A
33+        - refId: B
34+          queryType: ''
35+          relativeTimeRange:
36+            from: 600
37+            to: 0
38+          datasourceUid: "-100"
39+          model:
40+            conditions:
41+            - evaluator:
42+                params: []
43+                type: gt
44+              operator:
45+                type: and
46+              query:
47+                params:
48+                - B
49+              reducer:
50+                params: []
51+                type: last
52+              type: query
53+            datasource:
54+              type: __expr__
55+              uid: "-100"
56+            expression: A
57+            hide: false
58+            intervalMs: 1000
59+            maxDataPoints: 43200
60+            reducer: last
61+            refId: B
62+            type: reduce
63+        - refId: C
64+          queryType: ''
65+          relativeTimeRange:
66+            from: 600
67+            to: 0
68+          datasourceUid: "-100"
69+          model:
70+            conditions:
71+            - evaluator:
72+                params:
73+                - 1
74+                type: lt
75+              operator:
76+                type: and
77+              query:
78+                params:
79+                - C
80+              reducer:
81+                params: []
82+                type: last
83+              type: query
84+            datasource:
85+              type: __expr__
86+              uid: "-100"
87+            expression: B
88+            hide: false
89+            intervalMs: 1000
90+            maxDataPoints: 43200
91+            refId: C
92+            type: threshold
93+        updated: '2023-02-24T20:38:48Z'
94+        noDataState: NoData
95+        execErrState: Error
96+        for: 1m
97+        annotations:
98+          description: HTTP redirect is invalid for {{ $labels.instance }}
99+          summary: HTTP redirect is invalid for {{ $labels.instance }}
A monitoring/grafana/config/provisioning/alerting/notificationpolicies.yml
+9, -0
 1@@ -0,0 +1,9 @@
 2+apiVersion: 1
 3+
 4+policies:
 5+  - orgId: 1
 6+    receiver: Notifico
 7+    group_by: ["alertname"]
 8+    group_wait: 30s
 9+    group_interval: 5m
10+    repeat_interval: 4h
A monitoring/grafana/config/provisioning/alerting/ping.yml
+98, -0
 1@@ -0,0 +1,98 @@
 2+apiVersion: 1
 3+
 4+groups:
 5+  - orgId: 1
 6+    name: Ping
 7+    folder: Default
 8+    interval: 60s
 9+    rules:
10+      - id: 4
11+        uid: ZbqmOnbVz
12+        orgID: 1
13+        folderUID: wLZWgnbVz
14+        ruleGroup: Ping
15+        title: Ping Status
16+        condition: C
17+        data:
18+        - refId: A
19+          queryType: ''
20+          relativeTimeRange:
21+            from: 600
22+            to: 0
23+          datasourceUid: PBFA97CFB590B2093
24+          model:
25+            editorMode: code
26+            expr: probe_success{job="blackbox_exporter_ping"}
27+            hide: false
28+            intervalMs: 1000
29+            legendFormat: __auto
30+            maxDataPoints: 43200
31+            range: true
32+            refId: A
33+        - refId: B
34+          queryType: ''
35+          relativeTimeRange:
36+            from: 600
37+            to: 0
38+          datasourceUid: "-100"
39+          model:
40+            conditions:
41+            - evaluator:
42+                params: []
43+                type: gt
44+              operator:
45+                type: and
46+              query:
47+                params:
48+                - B
49+              reducer:
50+                params: []
51+                type: last
52+              type: query
53+            datasource:
54+              type: __expr__
55+              uid: "-100"
56+            expression: A
57+            hide: false
58+            intervalMs: 1000
59+            maxDataPoints: 43200
60+            reducer: last
61+            refId: B
62+            type: reduce
63+        - refId: C
64+          queryType: ''
65+          relativeTimeRange:
66+            from: 600
67+            to: 0
68+          datasourceUid: "-100"
69+          model:
70+            conditions:
71+            - evaluator:
72+                params:
73+                - 1
74+                type: lt
75+              operator:
76+                type: and
77+              query:
78+                params:
79+                - C
80+              reducer:
81+                params: []
82+                type: last
83+              type: query
84+            datasource:
85+              type: __expr__
86+              uid: "-100"
87+            expression: B
88+            hide: false
89+            intervalMs: 1000
90+            maxDataPoints: 43200
91+            refId: C
92+            type: threshold
93+        updated: '2023-02-24T20:30:57Z'
94+        noDataState: NoData
95+        execErrState: Error
96+        for: 1m
97+        annotations:
98+          description: Ping is inaccessible for {{ $labels.instance }}
99+          summary: Ping is inaccessible for {{ $labels.instance }}
A monitoring/grafana/config/provisioning/alerting/ssh.yml
+98, -0
 1@@ -0,0 +1,98 @@
 2+apiVersion: 1
 3+
 4+groups:
 5+  - orgId: 1
 6+    name: SSH
 7+    folder: Default
 8+    interval: 60s
 9+    rules:
10+      - id: 2
11+        uid: dOT8vnbVz
12+        orgID: 1
13+        folderUID: wLZWgnbVz
14+        ruleGroup: SSH
15+        title: SSH Status
16+        condition: C
17+        data:
18+        - refId: A
19+          queryType: ''
20+          relativeTimeRange:
21+            from: 600
22+            to: 0
23+          datasourceUid: PBFA97CFB590B2093
24+          model:
25+            editorMode: code
26+            expr: probe_success{job="blackbox_exporter_ssh"}
27+            hide: false
28+            intervalMs: 1000
29+            legendFormat: __auto
30+            maxDataPoints: 43200
31+            range: true
32+            refId: A
33+        - refId: B
34+          queryType: ''
35+          relativeTimeRange:
36+            from: 600
37+            to: 0
38+          datasourceUid: "-100"
39+          model:
40+            conditions:
41+            - evaluator:
42+                params: []
43+                type: gt
44+              operator:
45+                type: and
46+              query:
47+                params:
48+                - B
49+              reducer:
50+                params: []
51+                type: last
52+              type: query
53+            datasource:
54+              type: __expr__
55+              uid: "-100"
56+            expression: A
57+            hide: false
58+            intervalMs: 1000
59+            maxDataPoints: 43200
60+            reducer: last
61+            refId: B
62+            type: reduce
63+        - refId: C
64+          queryType: ''
65+          relativeTimeRange:
66+            from: 600
67+            to: 0
68+          datasourceUid: "-100"
69+          model:
70+            conditions:
71+            - evaluator:
72+                params:
73+                - 1
74+                - 0
75+                type: lt
76+              operator:
77+                type: and
78+              query:
79+                params: []
80+              reducer:
81+                params: []
82+                type: avg
83+              type: query
84+            datasource:
85+              name: Expression
86+              type: __expr__
87+              uid: __expr__
88+            expression: B
89+            intervalMs: 1000
90+            maxDataPoints: 43200
91+            refId: C
92+            type: threshold
93+        updated: '2023-02-24T20:26:24Z'
94+        noDataState: NoData
95+        execErrState: Error
96+        for: 1m
97+        annotations:
98+          description: SSH is inaccessible for {{ $labels.instance }}
99+          summary: SSH is inaccessible for {{ $labels.instance }}