1 下载process-exporter
# Download the process-exporter v0.7.10 linux-amd64 release tarball from GitHub.
wget https://github.com/ncabatoff/process-exporter/releases/download/v0.7.10/process-exporter-0.7.10.linux-amd64.tar.gz
2 安装部署process-exporter
# Unpack the release tarball into the current directory.
tar -xf process-exporter-0.7.10.linux-amd64.tar.gz
# Install the unpacked directory under /usr/local/process_exporter.
# This path must match the ExecStart path used by the systemd unit
# (/usr/local/process_exporter/process-exporter).
sudo mv process-exporter-0.7.10.linux-amd64 /usr/local/process_exporter
注册到系统服务
# Write the systemd unit for process-exporter. The heredoc body below becomes
# the unit file verbatim: it runs the binary from /usr/local/process_exporter
# with the config file process-conf.yaml and restarts it whenever it exits.
cat > /usr/lib/systemd/system/process_exporter.service << EOF
[Unit]
Description=process_exporter
Documentation=https://github.com/ncabatoff/process-exporter
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/process_exporter/process-exporter -config.path=/usr/local/process_exporter/process-conf.yaml
Restart=always
[Install]
WantedBy=multi-user.target
EOF
加载并开机自启
# Reload systemd unit files so the new unit is visible, then enable it at boot.
systemctl daemon-reload && systemctl enable process_exporter
启动process exporter
# Start the service now; "enable" alone only affects boot-time startup.
systemctl start process_exporter
3 配置process-exporter
指定一个进程:
# Monitor a single process: group every process whose command line matches
# the regular expression 'redis-server'.
process_names:
  - name: "{{.Matches}}"
    # cmdline must be nested under the list item so it acts as that
    # entry's matcher.
    cmdline:
      - 'redis-server'
指定多个进程:
# Monitor several processes: one list entry per process, each with its own
# cmdline regular expression nested under that entry.
process_names:
  - name: "{{.Matches}}"
    cmdline:
      - 'test1'
  - name: "{{.Matches}}"
    cmdline:
      - 'test2'
  - name: "{{.Matches}}"
    cmdline:
      - 'test3'
指定所有进程:
# Monitor all processes: the regex '.+' matches any non-empty command line,
# and each group is named after the process's command name ({{.Comm}}).
process_names:
  - name: "{{.Comm}}"
    cmdline:
      - '.+'
4 配置Prometheus
# Scrape job entry (belongs under `scrape_configs:` in prometheus.yml).
# Targets are discovered from the file-based service discovery JSON below.
- job_name: test05进程监控
  scrape_interval: 2m
  # The timeout equals the scrape interval (120s == 2m).
  scrape_timeout: 120s
  file_sd_configs:
    - files:
        - /usr/local/prometheus/sd_config/test05-process.json
cat /usr/local/prometheus/sd_config/test05-process.json
[
  {
    "labels": {
      "desc": "lrma",
      "group": "lrma",
      "host_ip": "192.168.11.55",
      "hostname": "test05"
    },
    "targets": [
      "192.168.11.55:9256"
    ]
  },
  {
    "labels": {
      "desc": "lrma",
      "group": "lrma",
      "host_ip": "192.168.11.56",
      "hostname": "test06"
    },
    "targets": [
      "192.168.11.56:9256"
    ]
  }
]
重启prometheus
# Restart the prometheus service.
systemctl restart prometheus
5 接入Grafana图形化展示
输入导入的模板id 249,数据源选择Prometheus
6 设置告警
# Alerting rule: fire when no redis-server process has been counted for 1 minute.
groups:
  - name: redis-server进程挂了
    rules:
      - alert: redis-server进程挂了
        # NOTE(review): when the process-exporter group name is "{{.Matches}}"
        # and the cmdline regex has no named capture groups, the groupname
        # label may render as "map[:redis-server]" rather than "redis-server".
        # Confirm the actual label value on the exporter's /metrics page.
        expr: (namedprocess_namegroup_num_procs{groupname="redis-server"}) == 0
        for: 1m
        labels:
          severity: error
          status: 非常严重
        annotations:
          # Prometheus templates expose labels and the sample value through
          # the $labels and $value variables; the "$" is required.
          summary: "{{ $labels.hostname }} 上redis-server Alert {{ $labels.instance }} has been down for more than 1 minutes"
          description: "redis-server 进程挂了, 当前进程值为 {{ $value }}"