Merge pull request #1511 from mitodl/retire_fluentd_phase_1a
Installing and configuring vector
Ardiea authored Mar 3, 2022
2 parents f2ff6f2 + 86041c4 commit b9123c4
Showing 20 changed files with 671 additions and 760 deletions.
11 changes: 5 additions & 6 deletions pillar/top.sls
@@ -3,7 +3,7 @@ base:
- match: compound
- common
- environment_settings
# - vector
- vector
# '* and not proxy-* and not restore-* and not G@roles:devstack and not P@environment:mitxonline and not G@context:packer and not P@roles:(edx|edx-worker)$':
# - match: compound
# - fluentd
@@ -15,7 +15,6 @@ base:
- elastic_stack.version_production
'roles:auth_server':
- match: grain
- vector.cas
'G@roles:elasticsearch and not P@environment:operations*':
- match: compound
- consul
@@ -82,8 +81,8 @@ base:
- consul
- shibboleth
- shibboleth.odlvideo
- fluentd.odlvideo
- logrotate.odlvideo
- vector.odlvideo
proxy-bootcamps-*:
- heroku.bootcamps
proxy-mitxpro-*:
@@ -102,7 +101,7 @@ base:
- consul
- shibboleth
- shibboleth.mitx_cas
- fluentd.cas
- vector.cas
'G@roles:rabbitmq and P@environment:mitx.*':
- match: compound
- rabbitmq.mitx
@@ -114,8 +113,8 @@ base:
- match: grain
- nginx
- nginx.reddit
- reddit
- vector.reddit
- reddit
'G@environment:operations and G@roles:redash':
- match: compound
- nginx
@@ -217,8 +216,8 @@ base:
'roles:rabbitmq':
- match: grain
- rabbitmq
- vector.rabbitmq
- consul.rabbitmq
- vector.rabbitmq
'roles:tika':
- match: grain
- nginx
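The top-file entries above use Salt's standard pillar targeting: each matcher assigns a list of pillar files to the minions it matches, and this change swaps the fluentd includes for their vector equivalents. As a brief reminder of the two matching styles used throughout the file, here is a minimal sketch built from entries that already appear in the diff above:

base:
  # Grain match: any minion whose `roles` grain includes `rabbitmq` gets the
  # shared rabbitmq pillar plus the vector pillar that ships its logs.
  'roles:rabbitmq':
    - match: grain
    - rabbitmq
    - vector.rabbitmq
  # Compound match: combine grain (G@) and PCRE (P@) targets to scope a
  # pillar to one role within a particular set of environments.
  'G@roles:rabbitmq and P@environment:mitx.*':
    - match: compound
    - rabbitmq.mitx

Keeping each vector.<app> pillar next to the application pillar it describes means a host only renders the Vector sources, transforms, and sinks relevant to its role.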
153 changes: 9 additions & 144 deletions pillar/vector/cas.sls
@@ -1,145 +1,10 @@
vector:
extra_configurations:
- name: cas_logs
content:
log_schema:
timestamp_key: vector_timestamp
host_key: log_host
sources:
collect_cas_nginx_access_logs:
type: file
read_from: end
file_key: log_file
include:
- /var/log/nginx/access.log
collect_cas_nginx_error_logs:
type: file
read_from: end
file_key: log_file
include:
- /var/log/nginx/error.log
collect_cas_application_logs:
type: file
read_from: end
file_key: log_file
include:
- /opt/log/django.log
multiline:
start_pattern: '^\['
condition_pattern: '^\['
mode: 'halt_before'
timeout_ms: 5000
collect_auth_logs:
{{ salt.pillar.get('vector:base_auth_log_collection')|yaml(False)|indent(8) }}
transforms:
# Transforms for NGINX logs
parse_cas_nginx_access_logs:
type: remap
inputs:
- 'collect_cas_nginx_access_logs'
source: |
parsed, err = parse_regex(.message, r'^time=(?P<time>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2})\sclient=(?P<client>[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})\smethod=(?P<method>\S*)\srequest="(?P<request>.*)"\srequest_length=(?P<request_length>\d+)\sstatus=(?P<status>\d+)\sbytes_sent=(?P<bytes_sent>\d+)\sbody_bytes_sent=(?P<body_bytes_sent>\d+)\sreferer=(?P<referer>.*)\suser_agent="(?P<user_agent>.+)"\supstream_addr=(?P<upstream_addr>.+)\supstream_status=(?P<upstream_status>.+)\srequest_time=(?P<request_time>.+)\srequest_id=(?P<request_id>\w+)\supstream_response_time=(?P<upstream_response_time>.+)\supstream_connect_time=(?P<upstream_connect_time>.+)\supstream_header_time=(?P<upstream_header_time>.*)$')
if err != null {
.parse_error = err
}
err = null
. = merge(., parsed)
.log_process = "nginx"
.log_type = "cas.nginx.access"
.environment = "${ENVIRONMENT}"
parsed_bs, err = to_int(.bytes_sent)
if err == null {
.bytes_sent = parsed_bs
}
err = null
parsed_bbs, err = to_int(.body_bytes_sent)
if err == null {
.body_bytes_sent = parsed_bbs
}
err = null
parsed_rl, err = to_int(.request_length)
if err == null {
.request_length = parsed_rl
}
err = null
parsed_rt, err = to_float(.request_time)
if err == null {
.request_time = parsed_rt
}
err = null
parsed_status, err = to_int(.status)
if err == null {
.status = parsed_status
}
err = null
parsed_usct, err = to_float(.upstream_connect_time)
if err == null {
.upstream_connect_time = parsed_usct
}
err = null
parsed_usht, err = to_float(.upstream_header_time)
if err == null {
.upstream_header_time = parsed_usht
}
err = null
parsed_uprt, err = to_float(.upstream_response_time)
if err == null {
.upstream_response_time = parsed_uprt
}
err = null
parsed_ups, err = to_int(.upstream_status)
if err == null {
.upstream_status = parsed_ups
}
err = null
filter_healthchecks_cas_nginx_access_logs:
inputs:
- 'parse_cas_nginx_access_logs'
type: filter
condition: '! contains!(.http_user_agent, "ELB-HealthChecker")'
parse_cas_nginx_error_logs:
type: remap
inputs:
- 'collect_cas_nginx_error_logs'
source: |
parsed, err = parse_regex(.message, r'^(?P<time>\d{4}/\d{2}/\d{2}\s\d{2}:\d{2}:\d{2})\s\[(?P<severity>.*)\]\s(?P<pid>\d*)#(?P<tid>\d*):\s\*(?P<cid>\d*)\s(?P<message>.*),\sclient:\s(?P<client>.*),\sserver:(?P<server>.*)(?P<additional_content>.*)$')
. = merge(., parsed)
if err != null {
.parse_error = err
}
.log_process = "nginx"
.log_type = "cas.nginx.error"
.environment = "${ENVIRONMENT}"
parse_cas_application_logs:
type: remap
inputs:
- 'collect_cas_application_logs'
source: |
parsed, err = parse_regex(.message, r'^\[(?P<time>\d{4}-\d{2}-\d{2}\w+:\d{2}:\d{2})\] (?P<log_level>\w+) \[(?P<module_name>[a-zA-Z0-9-_.]+):(?P<line_number>\d+)\] (?P<message>.*)')
if err != null {
.parse_error = err
}
. = merge(., parsed)
.log_process = "cas"
.log_type = "cas.application"
.environment = "${ENVIRONMENT}"
enrich_cas_application_logs:
type: aws_ec2_metadata
inputs:
- 'parse_cas_application_logs'
namespace: ec2
parse_auth_logs:
{{ salt.pillar.get('vector:base_auth_log_parse_source')|yaml(False)|indent(10) }}
sinks:
ship_cas_logs_to_grafana_cloud:
inputs:
- 'filter_healthchecks_cas_nginx_access_logs'
- 'parse_cas_nginx_error_logs'
- 'enrich_cas_application_logs'
- 'parse_auth_logs'
type: loki
labels:
application: cas
environment: ${ENVIRONMENT}
service: cas
{{ salt.pillar.get('vector:base_loki_configuration')|yaml(False)|indent(10) }}
configurations:
- host_metrics
- auth_logs
- nginx_logs
- cas_logs

config_elements:
application_name: 'cas'
service_name: 'cas'
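The net effect of this file's change is that the hand-written sources, transforms, and sinks above are deleted and replaced by the short configurations plus config_elements pillar, leaving the concrete Vector config to shared templates in the vector formula. That formula is not part of this diff, so the following Salt state is only a hedged sketch of how such a pillar might be consumed; the state ID, file paths, and template names are hypothetical:

{% for config_name in salt.pillar.get('vector:configurations', []) %}
# Hypothetical state: render one Vector config file per named configuration,
# passing the per-application values from vector:config_elements as context.
vector_config_{{ config_name }}:
  file.managed:
    - name: /etc/vector/{{ config_name }}.yaml
    - source: salt://vector/templates/{{ config_name }}.yaml.j2
    - template: jinja
    - context:
        application_name: {{ salt.pillar.get('vector:config_elements:application_name') }}
        service_name: {{ salt.pillar.get('vector:config_elements:service_name') }}
{% endfor %}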
93 changes: 15 additions & 78 deletions pillar/vector/init.sls
@@ -1,80 +1,17 @@
{% set ENVIRONMENT = salt.grains.get('environment', 'dev') %}

vector:
base_auth_log_collection:
type: file
file_key: log_file
read_from: end
include:
- /var/log/auth.log
base_auth_log_parse_source:
type: remap
inputs:
- 'collect_auth_logs'
source: |
parsed, err = parse_syslog(.message)
if err != null {
.parse_error = err
}
. = merge(., parsed)
.log_process = "authlog"
.environment = "${ENVIRONMENT}"
# These two are intentionally incomplete sink configurations. The type, inputs, and labels
# need to be provided on a configuration-by-configuration basis.
base_loki_configuration:
auth:
strategy: basic
password: __vault__::secret-operations/global/grafana-cloud-credentials>data>api_key
user: __vault__::secret-operations/global/grafana-cloud-credentials>data>loki_user
endpoint: https://logs-prod-us-central1.grafana.net
encoding:
codec: json
out_of_order_action: rewrite_timestamp
base_cortex_configuration:
endpoint: https://prometheus-prod-10-prod-us-central-0.grafana.net/api/prom/push
healthcheck: false
auth:
strategy: basic
user: __vault__::secret-operations/global/grafana-cloud-credentials>data>prometheus_user
password: __vault__::secret-operations/global/grafana-cloud-credentials>data>api_key

# By default, there are no extra vector configurations to add
extra_configurations: []
# This list only applies if there is not a more specific vector:configurations
# defined elsewhere. If there is, and you would like to include these elements as well,
# you will need to explicitly state them again.
configurations:
- host_metrics
- auth_logs

# Call out host metrics in their own area because they will be enabled globally
host_metrics_configuration:
sources:
collect_host_metrics:
type: host_metrics
scrape_interval_secs: 60
collectors:
- cpu
- filesystem
- load
- host
- memory
- network
transforms:
cleanup_host_metrics:
type: remap
inputs:
- 'collect_host_metrics'
source: |
# Drop all the not-real filesystems metrics
abort_match_filesystem, err = !(match_any(.tags.filesystem, [r'ext.', r'btrfs', r'xfs']))
if abort_match_filesystem {
abort
}
add_labels_to_host_metrics:
type: remap
inputs:
- 'cleanup_host_metrics'
source: |
.tags.environment = "${ENVIRONMENT}"
.tags.job = "integrations/linux_host"
sinks:
ship_host_metrics_to_grafana_cloud:
inputs:
- 'add_labels_to_host_metrics'
{{ salt.pillar.get('vector:base_cortex_configuration')|yaml(False)|indent(8) }}
config_elements:
application_name: 'configuration_error_application_name'
service_name: 'configuration-error_service_name'
environment: {{ salt.grains.get('environment', 'configuration_error_environment') }}
grafana_cloud_loki_endpoint: 'https://logs-prod-us-central1.grafana.net'
grafana_cloud_prometheus_endpoint: 'https://prometheus-prod-10-prod-us-central-0.grafana.net/api/prom/push'
grafana_cloud_loki_user: __vault__::secret-operations/global/grafana-cloud-credentials>data>loki_user
grafana_cloud_cortex_user: __vault__::secret-operations/global/grafana-cloud-credentials>data>prometheus_user
grafana_cloud_password: __vault__::secret-operations/global/grafana-cloud-credentials>data>api_key
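The removed base_loki_configuration and base_cortex_configuration fragments were designed to be spliced into concrete sinks with salt.pillar.get(...)|yaml(False)|indent(...), as cas.sls did above; the new config_elements block flattens the same Grafana Cloud settings into plain keys for templates to consume. A hedged sketch of the kind of Loki sink a template could emit from those keys follows — the sink and input names are illustrative, and the template itself is not shown in this diff:

sinks:
  ship_logs_to_grafana_cloud:                      # illustrative sink name
    type: loki
    inputs:
      - 'parse_auth_logs'                          # illustrative input
    endpoint: {{ grafana_cloud_loki_endpoint }}
    auth:
      strategy: basic
      user: {{ grafana_cloud_loki_user }}
      password: {{ grafana_cloud_password }}
    encoding:
      codec: json
    out_of_order_action: rewrite_timestamp
    labels:
      application: {{ application_name }}          # from vector:config_elements
      service: {{ service_name }}
      environment: {{ environment }}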
55 changes: 8 additions & 47 deletions pillar/vector/ocw_build.sls
@@ -1,48 +1,9 @@
vector:
configuration:
api:
enabled: true

log_schema:
timestamp_key: vector_timestamp
host_key: log_host

sources:
webhook_publish_log:
type: file
include:
- /opt/ocw/logs/webhook-publish.log

transforms:
webhook_publish_log_parser:
inputs:
- webhook_publish_log
type: remap
source: |
matches, err = parse_regex(
.message,
r'^(?P<time>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{9}) (?P<message>.*)'
)
if matches != null {
.message = matches.message
.@timestamp = parse_timestamp!(matches.time, "%F %T%.9f")
.labels = ["ocw_build"]
.environment = "{{ salt.grains.get('environment') }}"
} else {
log(err, level: "error")
.malformed = true
}
webhook_publish_malformed_message_filter:
inputs:
- webhook_publish_log_parser
type: filter
condition: .malformed != true

sinks:
es_cluster:
inputs:
- webhook_publish_malformed_message_filter
type: elasticsearch
endpoint: 'http://operations-elasticsearch.query.consul:9200'
index: logstash-ocw-build-%Y.%W
healthcheck: false
configurations:
- host_metrics
- auth_logs
- ocw_build_logs

config_elements:
application_name: 'ocw_build'
service_name: 'ocw_builds'
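For reference, the removed ocw_build pipeline tags any line that fails the regex with .malformed = true and drops it in a separate filter transform before the Elasticsearch sink. A sketch of one successfully parsed event is shown below; the input line and resulting field values are invented for illustration only:

# Hypothetical line in /opt/ocw/logs/webhook-publish.log:
#   2022-03-01 14:02:11.123456789 published site ocw-example-course
# Event emitted by webhook_publish_log_parser for that line (sketch):
message: 'published site ocw-example-course'
'@timestamp': '2022-03-01 14:02:11.123456789'     # parse_timestamp with "%F %T%.9f"
labels:
  - ocw_build
environment: 'production'                         # value of the environment grain; example only
# A line that does not match keeps its original message, gains malformed: true,
# and is removed by webhook_publish_malformed_message_filter before es_cluster.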