-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
update site at 20240408-211317, machine LiaoSirui-MacMini.local
- Loading branch information
Showing
8 changed files
with
173 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
```yaml | ||
[root@gtcq-gt-monitor-prometheus-01 rules]# more gt-dwz-monitor.rules | ||
groups: | ||
- name: dwz-gt-monitor | ||
rules: | ||
- alert: "node-Agent告警" | ||
expr: up{job="gt-dwz-node-exporter"} == 0 | ||
for: 120s | ||
labels: | ||
severity: "重要" | ||
team: dwz-gt-monitor | ||
alert_type: "Agent告警" | ||
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}" | ||
annotations: | ||
summary: "{{ $labels.instance }} 已停止采集监控数据 120s!" | ||
description: "{{ $labels.instance }} job {{ $labels.job }} 暴露监控数据已停止." | ||
|
||
- alert: "CPU使用率监控" | ||
expr: ceil(100 - sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter",mode="idle"}[5m])) by(instance) / sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter"}[5m])) | ||
by(instance)*100) > 80 | ||
for: 2m | ||
labels: | ||
severity: "重要" | ||
team: bdfb | ||
alert_type: "CPU告警" | ||
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}" | ||
annotations: | ||
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} CPU使用率过高" | ||
description: "IP:{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的CPU使用大于80% (当前值: {{ $value }})" | ||
|
||
- alert: "磁盘使用率监控" | ||
expr: round((1 - (node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"} / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter" | ||
})) * 100) > 80 | ||
for: 2m | ||
labels: | ||
severity: "重要" | ||
team: dwz-gt-monitor | ||
alert_type: "Disk告警" | ||
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}" | ||
annotations: | ||
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高" | ||
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的{{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }}%)" | ||
|
||
- alert: "内存使用率监控" | ||
expr: ceil( (1 - (node_memory_MemAvailable_bytes{job="gt-dwz-node-exporter"} / (node_memory_MemTotal_bytes{job="gt-dwz-node-exporter"})))* 100 ) > 80 | ||
for: 2m | ||
labels: | ||
severity: "重要" | ||
team: dwz-gt-monitor | ||
alert_type: "MEM告警" | ||
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}" | ||
annotations: | ||
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}内存使用率过高" | ||
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}内存使用大于80% (当前值: {{ $value }})" | ||
|
||
- alert: "服务器大法宝CPULoad5" | ||
expr: node_load5{job="gt-dwz-node-exporter"} > 100 | ||
for: 2m | ||
labels: | ||
severity: "重要" | ||
team: dwz-gt-monitor | ||
alert_type: "负载告警" | ||
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}" | ||
annotations: | ||
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}CPU负载过高" | ||
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} CPU负载load大于100 (当前值: {{ $value }})" | ||
|
||
- alert: "服务器文件句柄监控" | ||
expr: node_filefd_allocated{job="gt-dwz-node-exporter"} > 50000 | ||
for: 2m | ||
labels: | ||
severity: "重要" | ||
team: dwz-gt-monitor | ||
alert_type: "文件句柄告警" | ||
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}" | ||
annotations: | ||
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 文件句柄使用过高" | ||
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 文件句柄使用过高大于50000 (当前值: {{ $value }})" | ||
|
||
- alert: "服务器TCP连接数监控" | ||
expr: node_sockstat_TCP_tw{job="gt-dwz-node-exporter"} > 15000 | ||
for: 2m | ||
labels: | ||
severity: "重要" | ||
team: dwz-gt-monitor | ||
alert_type: "TCP连接数告警" | ||
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}" | ||
annotations: | ||
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 等待关闭的TCP连接数过高" | ||
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 等待关闭的TCP连接数TIME_WAIT过高大于15000 (当前值: {{ $value }})" | ||
|
||
- alert: "服务器入口流量监控" | ||
expr: round((sum by (instance) (irate(node_network_receive_bytes_total{job="gt-dwz-node-exporter",device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])))/1024/1024) > 50 | ||
for: 2m | ||
labels: | ||
severity: "重要" | ||
team: dwz-gt-monitor | ||
alert_type: "流量告警" | ||
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}" | ||
annotations: | ||
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}监控入口流量过高" | ||
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控入口流量过高大于50MB/s (告警值: {{ $value }}MB/s)" | ||
|
||
- alert: "服务器出口流量监控" | ||
expr: round((sum by (instance) (irate(node_network_transmit_bytes_total{job="gt-dwz-node-exporter",device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])))/1024/1024) > 50 | ||
for: 2m | ||
labels: | ||
severity: "重要" | ||
team: dwz-gt-monitor | ||
alert_type: "流量告警" | ||
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}" | ||
annotations: | ||
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高" | ||
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高大于50MB/s (告警值: {{ $value }}MB/s)" | ||
[root@gtcq-gt-monitor-prometheus-01 rules]# | ||
|
||
``` | ||
|
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,4 +11,5 @@ | |
|
||
## 配置字段和状态 | ||
|
||
## 添加自定义问题类型 | ||
## 添加自定义问题类型 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
## 上架机器流程 | ||
|
||
提交工单的基础信息: | ||
|
||
- 需要开通的网络关系 | ||
- 是否申请 sudo 权限 | ||
- 机器连接方式:jumpserver 或者 ssh | ||
- 是否需要测试 | ||
|
||
```mermaid | ||
sequenceDiagram | ||
actor OPSM as 运维负责人 | ||
actor OPS as 上架负责人 | ||
actor OPSMK as 监控工程师(k8s) | ||
actor OPSJ as JumpServer负责人 | ||
actor OPSN as 网络工程师 | ||
participant E as 流程结束 | ||
OPSM->>OPSJ: 添加远程 | ||
OPSJ->>OPSM: 返回远程信息并分配权限 | ||
OPSM->>OPS: 提交申请并手动指派上架人员 | ||
OPS->>OPS: 进行上架处理步骤,详见上架文档 | ||
OPS->>OPSMK: 添加监控 | ||
OPSMK->>OPS: 返回监控配置并等待确认 | ||
OPS->>OPSN: 提交网络配置信息 | ||
OPSN->>OPS: 返回上架网络信息并等待确认 | ||
OPS->>OPSM: 返回上架信息并等待确认 | ||
OPS->>E: 确认后结束流程 | ||
``` | ||
|
||
|
||
|
||
## 下架机器流程 | ||
|
||
```mermaid | ||
sequenceDiagram | ||
actor REQ as 申请人 | ||
actor OPSM as 运维负责人 | ||
actor OPSN as 网络工程师 | ||
actor OPSN as 网络工程师 | ||
participant E as 流程结束 | ||
``` | ||
|
File renamed without changes.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters