Skip to content

Commit

Permalink
update site at 20240408-211317, machine LiaoSirui-MacMini.local
Browse files Browse the repository at this point in the history
  • Loading branch information
LiaoSirui committed Apr 8, 2024
1 parent 9e4e235 commit 0056705
Show file tree
Hide file tree
Showing 8 changed files with 173 additions and 15 deletions.
118 changes: 118 additions & 0 deletions 监控/PrometheusStack/采集客户端/node-exporter/常用告警.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
```yaml
# [root@gtcq-gt-monitor-prometheus-01 rules]# more gt-dwz-monitor.rules
groups:
- name: dwz-gt-monitor
rules:
- alert: "node-Agent告警"
expr: up{job="gt-dwz-node-exporter"} == 0
for: 120s
labels:
severity: "重要"
team: dwz-gt-monitor
alert_type: "Agent告警"
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
annotations:
summary: "{{ $labels.instance }} 已停止采集监控数据超过 120s!"
description: "{{ $labels.instance }} job {{ $labels.job }} 暴露监控数据已停止."

- alert: "CPU使用率监控"
expr: ceil(100 - sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter",mode="idle"}[5m])) by(instance) / sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter"}[5m])) by(instance)*100) > 80
for: 2m
labels:
severity: "重要"
team: bdfb
alert_type: "CPU告警"
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
annotations:
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} CPU使用率过高"
description: "IP:{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的CPU使用大于80% (当前值: {{ $value }})"

- alert: "磁盘使用率监控"
expr: round((1 - (node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"} / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"})) * 100) > 80
for: 2m
labels:
severity: "重要"
team: dwz-gt-monitor
alert_type: "Disk告警"
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
annotations:
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的{{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }}%)"

- alert: "内存使用率监控"
expr: ceil( (1 - (node_memory_MemAvailable_bytes{job="gt-dwz-node-exporter"} / (node_memory_MemTotal_bytes{job="gt-dwz-node-exporter"})))* 100 ) > 80
for: 2m
labels:
severity: "重要"
team: dwz-gt-monitor
alert_type: "MEM告警"
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
annotations:
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}内存使用率过高"
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}内存使用大于80% (当前值: {{ $value }})"

- alert: "服务器大法宝CPULoad5"
expr: node_load5{job="gt-dwz-node-exporter"} > 100
for: 2m
labels:
severity: "重要"
team: dwz-gt-monitor
alert_type: "负载告警"
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
annotations:
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}CPU负载过高"
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} CPU负载load大于100 (当前值: {{ $value }})"

- alert: "服务器文件句柄监控"
expr: node_filefd_allocated{job="gt-dwz-node-exporter"} > 50000
for: 2m
labels:
severity: "重要"
team: dwz-gt-monitor
alert_type: "文件句柄告警"
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
annotations:
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 文件句柄使用过高"
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 文件句柄使用过高大于50000 (当前值: {{ $value }})"

- alert: "服务器TCP连接数监控"
expr: node_sockstat_TCP_tw{job="gt-dwz-node-exporter"} > 15000
for: 2m
labels:
severity: "重要"
team: dwz-gt-monitor
alert_type: "TCP连接数告警"
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
annotations:
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 等待关闭的TCP连接数过高"
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 等待关闭的TCP连接数TIME_WAIT过高大于15000 (当前值: {{ $value }})"

- alert: "服务器入口流量监控"
expr: round((sum by (instance) (irate(node_network_receive_bytes_total{job="gt-dwz-node-exporter",device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])))/1024/1024) > 50
for: 2m
labels:
severity: "重要"
team: dwz-gt-monitor
alert_type: "流量告警"
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
annotations:
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}监控入口流量过高"
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控入口流量过高,大于50MB/s (告警值: {{ $value }}MB/s)"

- alert: "服务器出口流量监控"
expr: round((sum by (instance) (irate(node_network_transmit_bytes_total{job="gt-dwz-node-exporter",device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])))/1024/1024) > 50
for: 2m
labels:
severity: "重要"
team: dwz-gt-monitor
alert_type: "流量告警"
alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
annotations:
summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高"
description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高,大于50MB/s (告警值: {{ $value }}MB/s)"
# [root@gtcq-gt-monitor-prometheus-01 rules]#

```

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@

## 配置字段和状态

## 添加自定义问题类型
## 添加自定义问题类型

Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@
- 条件 – 检查用户是否应该执行转换
- 验证程序 – 在执行转换之前,检查转换的任何输入(例如,用户的输入)是否有效
- 后期功能 – 执行转换之后,执行其他处理操作
- 属性 – 可用于进一步自定义转换的键值对
- 属性 – 可用于进一步自定义转换的键值对

需要针对不同的 Issue 类型制定不同的工作流,也就是绘制工作流状态转换图。方框代表状态,箭头代表转换动作,其中黑色方框代表初始态,蓝色方框代表过程态,绿色方框代表终结态。
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
## 上架机器流程

提交工单的基础信息:

- 需要开通的网络关系
- 是否申请 sudo 权限
- 机器连接方式:jumpserver 或者 ssh
- 是否需要测试

```mermaid
sequenceDiagram
actor OPSM as 运维负责人
actor OPS as 上架负责人
actor OPSMK as 监控工程师(k8s)
actor OPSJ as JumpServer负责人
actor OPSN as 网络工程师
participant E as 流程结束
OPSM->>OPSJ: 添加远程
OPSJ->>OPSM: 返回远程信息并分配权限
OPSM->>OPS: 提交申请并手动指派上架人员
OPS->>OPS: 进行上架处理步骤,详见上架文档
OPS->>OPSMK: 添加监控
OPSMK->>OPS: 返回监控配置并等待确认
OPS->>OPSN: 提交网络配置信息
OPSN->>OPS: 返回上架网络信息并等待确认
OPS->>OPSM: 返回上架信息并等待确认
OPS->>E: 确认后结束流程
```



## 下架机器流程

```mermaid
sequenceDiagram
actor REQ as 申请人
actor OPSM as 运维负责人
actor OPSN as 网络工程师
participant E as 流程结束
```

This file was deleted.

8 changes: 7 additions & 1 deletion 运维研发/项目管理与团队协作/Jira/Jira简介.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ Jira 是 Atlassian 公司出品的项目与事务跟踪工具,被广泛应用
- 界面(提交的部分)
- 字段(记录问题信息)

![image-20240408112825678](./.assets/Jira简介/image-20240408112825678.png)

![img](./.assets/Jira简介/fields_diagram.png)

通过自定义字段、界面和方案,可以充分利用 JIRA 系统的全部功能,并确保 Jira 的用户高效地工作
Expand All @@ -40,4 +42,8 @@ Jira 是 Atlassian 公司出品的项目与事务跟踪工具,被广泛应用

- Jira 使用:<https://www.yiibai.com/jira/jira-introduction.html>

- <https://doc.devpod.cn/jira/jira-15237264.html>
- <https://doc.devpod.cn/jira/jira-15237264.html>

- <https://blog.csdn.net/Nicolege678/article/details/124605511>

- <https://blog.csdn.net/qq_41386332/article/details/108658431>

0 comments on commit 0056705

Please sign in to comment.