一、sysdig
# docker run -it --rm --name sysdig --privileged=true --volume=/var/run/docker.sock:/host/var/run/docker.sock --volume=/dev:/host/dev --volume=/proc:/host/proc:ro --volume=/boot:/host/boot:ro --volume=/lib/modules:/host/lib/modules:ro --volume=/usr:/host/usr:ro sysdig/sysdig
root@58e0debbe53a:/# csysdig
下载插件失败后可以运行下边的命令,重新下载
root@58e0debbe53a:/# sysdig-probe-loader
二、Weave scope
# curl -L git.io/scope -o /usr/local/bin/scope
# chmod a+x /usr/local/bin/scope
# scope launch
三、Prometheus(普罗米修斯)
组件 | 说明 |
---|---|
Prometheus Server | 普罗米修斯的主服务器 |
Node Exporter | 负责收集Host硬件信息和操作系统信息 |
cAdvisor | 负责收集Host上运行的容器信息 |
Grafana | 负责展示普罗米修斯监控界面 |
1、安装部署
主机名称 | IP地址 | 安装组件 |
---|---|---|
machine | 192.168.1.10 | Node Exporter、cAdvisor、Prometheus Server、Grafana |
docker02 | 192.168.1.20 | Node Exporter、cAdvisor |
docker03 | 192.168.1.30 | Node Exporter、cAdvisor |
1、部署Node Exporter
# docker run -d -p 9100:9100 -v /proc/:/host/proc --name export -v /sys:/host/sys -v /:/rootfs --net=host prom/node-exporter --path.procfs /host/proc --path.sysfs /host/sys --collector.filesystem.ignored-mount-points "^/(sys|proc|dev|host|etc)($|/)"
2、部署安装cAdvisor
# docker run -v /:/rootfs:ro -v /var/run:/var/run/:rw -v /sys:/sys:ro -v /var/lib/docker:/var/lib/docker:ro -p 8080:8080 --detach=true --name=cadvisor --net=host google/cadvisor
3、部署Prometheus Server服务
# docker run -d --name prometheus prom/prometheus
# docker cp prometheus:/etc/prometheus/prometheus.yml ./
# docker stop prometheus
prometheus
# docker rm prometheus
prometheus
# vim prometheus.yml
static_configs:
  # Fragment of the scrape job in prometheus.yml: indent it to match the
  # surrounding job entry when pasting. Lists the collectors of every node.
  - targets:
      - 'localhost:9090'     # Prometheus server itself
      - 'localhost:8080'     # cAdvisor on this host
      - 'localhost:9100'     # node-exporter on this host
      - '192.168.1.20:8080'  # cAdvisor on docker02
      - '192.168.1.20:9100'  # node-exporter on docker02
      - '192.168.1.30:8080'  # cAdvisor on docker03
      - '192.168.1.30:9100'  # node-exporter on docker03
# docker run -d -p 9090:9090 --name prometheus --net=host -v /root/prometheus.yml:/etc/prometheus/prometheus.yml prom/prometheus
4、部署grafana服务
# mkdir grafana-storage
# chmod -R 777 grafana-storage/
# docker run -d -p 3000:3000 --name grafana -v /root/grafana-storage:/var/lib/grafana -e "GF_SECURITY_ADMIN_PASSWORD=123.com" grafana/grafana
2、添加监控数据
新版 Grafana 可以在启动容器时通过环境变量 GF_SECURITY_ADMIN_PASSWORD 直接指定 admin 账号的密码;旧版本默认登录账号和密码都为 admin。
3、导入监控模板
grafana官网: https://grafana.com/
1、JSON文件方式
2、添加模板的ID号
4、配置邮件报警
1、配置AlertManager
# docker run --name alertmanager -d -p 9093:9093 prom/alertmanager:latest
# docker cp alertmanager:/etc/alertmanager/alertmanager.yml ./
# cp alertmanager.yml alertmanager.ymlbak
2、打开SMTP服务
3、更改配置文件
# vim alertmanager.yml
# Alertmanager configuration: route every alert to a single e-mail receiver
# that sends through the QQ Mail SMTP server.
global:
  resolve_timeout: 5m
  smtp_from: '*******@qq.com'
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: '*******@qq.com'
  smtp_auth_password: '*******'  # SMTP authorization code, not the login password
  smtp_require_tls: false
  smtp_hello: 'qq.com'

# Single default route: alerts are grouped by alert name and all go to 'email'.
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'

receivers:
  - name: 'email'
    email_configs:
      - to: '*******@qq.com'
        # Also send a notification when the alert is resolved.
        send_resolved: true

# A 'critical' alert mutes a 'warning' alert that has the same values for the
# labels listed under 'equal'.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
# docker rm -f alertmanager
alertmanager
# docker run -d --name alertmanager -p 9093:9093 -v /root/alertmanager.yml:/etc/alertmanager/alertmanager.yml prom/alertmanager
4、配置告警规则
# mkdir -p prometheus/rules
# cd prometheus/rules/
# vim node-up.rules
# Prometheus alerting rule file: fire 'node-up' when a target of the
# 'prometheus' job has reported up == 0 for 15 seconds.
groups:
  - name: node-up
    rules:
      - alert: node-up
        expr: up{job="prometheus"} == 0
        for: 15s
        labels:
          # Quoted so the label value is read as a string, not an integer.
          severity: '1'
          team: node
        annotations:
          summary: "{{ $labels.instance }} 已停止运行超过 15s!"
# vim prometheus.yml
# Fragment of prometheus.yml: where to send alerts and which rule files to load.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Alertmanager published on port 9093 of the monitoring host.
            - '192.168.1.10:9093'

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # Path inside the Prometheus container; the host rules directory must be
  # mounted at /usr/local/prometheus/rules.
  - "/usr/local/prometheus/rules/*.rules"
# docker rm -f prometheus
prometheus
# docker run -d -p 9090:9090 -v /root/prometheus.yml:/etc/prometheus/prometheus.yml -v /root/prometheus/rules:/usr/local/prometheus/rules --name prometheus --net=host prom/prometheus
5、触发报警发送 Email
6、配置自定义邮件模板
# cd prometheus/
# mkdir alertmanager-tmpl
# cd alertmanager-tmpl/
# vim email.tmpl
{{/* Custom Alertmanager e-mail templates.
     "email.from" and "email.to" hold the sender and recipient addresses.
     "email.to.html" renders one HTML section per alert in .Alerts, printing
     the severity, alertname and instance labels and the summary annotation.
     StartsAt is shifted by 28800e9 ns (8 hours) before formatting;
     presumably to display UTC+8 local time (TODO confirm). */}}
{{ define "email.from" }}*******@qq.com{{ end }}
{{ define "email.to" }}*******@qq.com{{ end }}
{{ define "email.to.html" }}
{{ range .Alerts }}
=========start==========<br>
告警程序: prometheus_alert<br>
告警级别: {{ .Labels.severity }} 级<br>
告警类型: {{ .Labels.alertname }}<br>
故障主机: {{ .Labels.instance }}<br>
告警主题: {{ .Annotations.summary }}<br>
触发时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
=========end==========<br>
{{ end }}
{{ end }}
# cd
# vim alertmanager.yml
# docker restart prometheus
prometheus
# vim alertmanager.yml
# Alertmanager configuration using the custom e-mail templates: the recipient
# and the HTML body come from the "email.to" and "email.to.html" templates.
global:
  resolve_timeout: 5m
  smtp_from: '*******@qq.com'
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: '*******@qq.com'
  smtp_auth_password: '*******'
  smtp_require_tls: false
  smtp_hello: 'qq.com'

# Template files to load; this is a path inside the Alertmanager container,
# so the host template directory must be mounted at /etc/alertmanager-tmpl.
templates:
  - '/etc/alertmanager-tmpl/*.tmpl'

# Single default route: alerts are grouped by alert name and all go to 'email'.
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'

receivers:
  - name: 'email'
    email_configs:
      - to: '{{ template "email.to" }}'
        html: '{{ template "email.to.html" . }}'

# A 'critical' alert mutes a 'warning' alert that has the same values for the
# labels listed under 'equal'.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
# docker rm -f alertmanager
alertmanager
# docker run -d --name alertmanager -p 9093:9093 -v /root/alertmanager.yml:/etc/alertmanager/alertmanager.yml -v /root/prometheus/alertmanager-tmpl:/etc/alertmanager-tmpl prom/alertmanager
评论区