diff --git a/.gitignore b/.gitignore
index 85ec053..80fbd3a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,3 +56,8 @@ out/
 
 .DS_Store
 .env
+
+monitor/prometheus/volume/*
+monitor/grafana/alerting/*
+monitor/grafana/volume/*
+monitor/grafana/*.db
diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml
new file mode 100644
index 0000000..80eb1a8
--- /dev/null
+++ b/docker-compose.monitoring.yml
@@ -0,0 +1,76 @@
+# Monitoring stack: the Spring application, node_exporter, Prometheus and
+# Grafana, all attached to the same user-defined bridge network.
+version: '3'
+services:
+  spring-app:
+    container_name: spring-app
+    build: .
+    restart: unless-stopped
+    ports:
+      - "8080:8080"
+    environment:
+      # ${DB_URL}, ${DB_USERNAME} and ${DB_PASSWORD} are interpolated by Compose.
+      - SPRING_DATASOURCE_URL=jdbc:mysql://${DB_URL}
+      - SPRING_DATASOURCE_USERNAME=${DB_USERNAME}
+      - SPRING_DATASOURCE_PASSWORD=${DB_PASSWORD}
+    networks:
+      - network
+
+  node_exporter:
+    container_name: node_exporter
+    image: prom/node-exporter:latest
+    command:
+      # Must equal the container path where the host root is mounted below.
+      - '--path.rootfs=/rootfs'
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      # Successor of the deprecated --collector.filesystem.ignored-mount-points
+      # flag; "$$" is the Compose escape for a literal "$".
+      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
+    ports:
+      - "9100:9100"
+    networks:
+      - network
+    restart: unless-stopped
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+
+  prometheus:
+    container_name: prometheus
+    image: prom/prometheus:latest
+    # NOTE(review): presumably root so the bind-mounted data directory is writable - confirm.
+    user: root
+    ports:
+      - "9090:9090"
+    volumes:
+      - ./monitor/prometheus/config/:/etc/prometheus/
+      - ./monitor/prometheus/volume:/prometheus
+    command:
+      # Enables the HTTP /-/reload and /-/quit management endpoints.
+      - '--web.enable-lifecycle'
+      - '--config.file=/etc/prometheus/prometheus.yml'
+    restart: always
+    depends_on:
+      - spring-app
+      - node_exporter
+    networks:
+      - network
+
+  grafana:
+    container_name: grafana
+    image: grafana/grafana:latest
+    ports:
+      - "3000:3000"
+    depends_on:
+      - prometheus
+    volumes:
+      - ./monitor/grafana:/var/lib/grafana
+    restart: always
+    networks:
+      - network
+
+networks:
+  network:
+    driver: bridge
diff --git a/monitor/prometheus/config/prometheus.yml b/monitor/prometheus/config/prometheus.yml
new file mode 100644
index 0000000..d76f213
--- /dev/null
+++ b/monitor/prometheus/config/prometheus.yml
@@ -0,0 +1,41 @@
+global:
+  scrape_interval: 15s      # how often targets are scraped | default = 1m
+  scrape_timeout: 15s       # how long a scrape may run before timing out | default = 10s
+  evaluation_interval: 2m   # how often rules are evaluated | default = 1m
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: 'codelab-monitor'  # label attached by default
+  # File that PromQL queries are logged to; when unset, queries are not logged.
+  query_log_file: query_log_file.log
+
+rule_files:
+  - "rule.yml"  # resolved relative to the directory containing prometheus.yml
+
+scrape_configs:
+  - job_name: 'node_exporter'  # job_name must be unique across all scrape configs
+    scrape_interval: 10s       # overrides the global 15s for this job
+    scrape_timeout: 10s        # overrides the global 15s for this job
+    metrics_path: '/metrics'   # optional - HTTP path metrics are fetched from | default = /metrics
+    # optional - on a label conflict, true keeps the scraped label and false
+    # renames it to exported_<name> | default = false
+    honor_labels: false
+    # optional - whether timestamps present in the scraped data are used;
+    # false stamps samples with the scrape time instead | default = true
+    honor_timestamps: false
+    scheme: 'http'             # optional - protocol scheme used for requests | default = http
+
+    static_configs:
+      # node_exporter is reached by its Compose service name on the shared
+      # bridge network; host.docker.internal does not resolve on a plain
+      # Linux Docker Engine without an extra_hosts entry.
+      - targets: ['node_exporter:9100']
+        labels:  # optional - labels added to every metric scraped from these targets
+          service: 'server-1'
+
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+        labels:
+          service: 'prometheus'
diff --git a/monitor/prometheus/config/rule.yml b/monitor/prometheus/config/rule.yml
new file mode 100644
index 0000000..e385806
--- /dev/null
+++ b/monitor/prometheus/config/rule.yml
@@ -0,0 +1,22 @@
+groups:
+  - name: example
+    rules:
+      # Alert for any instance that is unreachable for >5 minutes.
+      - alert: InstanceDown
+        expr: up == 0
+        for: 5m
+        labels:
+          severity: page
+        annotations:
+          summary: "Instance {{ $labels.instance }} down"
+          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
+
+      # Alert for any instance that has a median request latency >1s.
+      # NOTE(review): confirm that a scraped target actually exports
+      # api_http_request_latencies_second; no job in prometheus.yml obviously does.
+      - alert: APIHighRequestLatency
+        expr: api_http_request_latencies_second{quantile="0.5"} > 1
+        for: 10m
+        annotations:
+          summary: "High request latency on {{ $labels.instance }}"
+          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"