deploy
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
#!/bin/bash
# One-shot deployment of a Prometheus monitoring stack:
# prometheus + node-exporter + blackbox-exporter + process-exporter.

mkdir -p monitoring/prometheus
cd monitoring

# Heredoc delimiter is quoted ('EOF') so nothing inside is shell-expanded.
cat > docker-compose.yml << 'EOF'
version: '3.8'

networks:
  monitoring:
    driver: bridge

services:
  prometheus:
    image: registry.cn-hangzhou.aliyuncs.com/lky-deploy/prometheus:v2.40.7
    container_name: prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - prom_data:/prometheus
    networks:
      - monitoring
    # node-exporter and process-exporter run with network_mode: host, so they
    # are NOT resolvable by service name from this bridge network. Map the
    # host gateway instead (requires Docker 20.10+).
    extra_hosts:
      - "host.docker.internal:host-gateway"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'

  node-exporter:
    image: registry.cn-hangzhou.aliyuncs.com/lky-deploy/nodeexporter:v1.9.1
    container_name: node-exporter
    restart: unless-stopped
    command:
      - '--path.rootfs=/host'
      - '--web.listen-address=:9400'
    network_mode: host
    pid: host
    volumes:
      - /:/host:ro,rslave

  blackbox-exporter:
    image: registry.cn-hangzhou.aliyuncs.com/lky-deploy/nodeexporter:blackbox-exporterv0.27.0
    container_name: blackbox-exporter
    restart: unless-stopped
    ports:
      - "9115:9115"
    networks:
      - monitoring
    volumes:
      - ./blackbox.yml:/etc/blackbox_exporter/config.yml
    command:
      - '--config.file=/etc/blackbox_exporter/config.yml'

  process-exporter:
    image: registry.cn-hangzhou.aliyuncs.com/lky-deploy/nodeexporter:process-exporter-v0.8.7
    container_name: process-exporter
    restart: unless-stopped
    # Under host networking a ports: mapping is ignored, so none is declared;
    # the exporter listens on the host's 9256 directly.
    network_mode: host
    volumes:
      - /proc:/host/proc:ro
      - ./process-exporter.yml:/config.yml
    command:
      - '-config.path=/config.yml'
      - '-procfs=/host/proc'

volumes:
  prom_data:
EOF

cat > prometheus/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Host-network exporters are reached through the host gateway alias.
  - job_name: 'node'
    static_configs:
      - targets: ['host.docker.internal:9400']

  - job_name: 'blackbox-http'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://qq.com
          - https://google.com
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  - job_name: 'process-exporter'
    static_configs:
      - targets: ['host.docker.internal:9256']
EOF

cat > blackbox.yml << 'EOF'
modules:
  http_2xx:
    prober: http
    timeout: 5s
    http:
      valid_http_versions: ["HTTP/1.1", "HTTP/2"]
      valid_status_codes: [200]
      method: GET
      preferred_ip_protocol: "ip4"
EOF

cat > process-exporter.yml << 'EOF'
process_names:
  - name: "{{.Comm}}"
    cmdline:
      - '.+'
EOF

docker compose up -d

echo -e "\n\033[32m部署完成!以下是访问信息:\033[0m"
echo "Prometheus:       http://localhost:9090"
# node-exporter is started with --web.listen-address=:9400, not the default 9100
echo "Node Exporter:    http://localhost:9400/metrics"
echo "Blackbox:         http://localhost:9115/metrics"
echo "Process-Exporter: http://localhost:9256/metrics"
echo -e "\n请修改以下配置文件后重启服务:"
echo "- blackbox.yml 中的监控目标"
echo "- process-exporter.yml 中的进程过滤规则"
|
如果节点较多,Prometheus 对全部指标的采集会带来较高的负载和磁盘占用,可以通过 relabel 丢弃(drop)不需要的指标,减轻负担。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
|
scrape_configs: - job_name: 'node-drop' static_configs: - targets: ['localhost:9100'] metric_relabel_configs: - source_labels: [__name__] regex: '^(node_cpu_seconds_total|node_memory_.*|node_disk_.*|node_network_.*)$' action: keep - job_name: 'node' static_configs: - targets: ['node1:9100', 'node2:9100','master:9100'] relabel_configs: - source_labels: [__address__] regex: 'node1:9100' replacement: 'prod' target_label: environment - source_labels: [__address__] regex: 'node2:9100' replacement: 'staging' target_label: environment - source_labels: [__address__] regex: 'node[0-9]+:9100' action: keep - source_labels: [__meta_kubernetes_pod_label_app] regex: "nginx|api-server" action: keep
node_exporter \ --collector.cpu \ --collector.meminfo \ --no-collector.diskstats \ --no-collector.netdev \ --no-collector.filesystem \
|
file_sd
可以基于文件动态更新 prometheus 的监控节点
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
| #文件类型/etc/prometheus/targets/nodes.json [ { "targets": ["192.168.1.10:9100"], # 监控目标地址(IP:Port) "labels": { # 自定义标签(可选) "env": "prod", "role": "web-server" } }, { "targets": ["192.168.1.11:9100","192.168.3.11:9100"], "labels": { "env": "staging", "role": "db-server" } } ] #prometheus配置 scrape_configs: - job_name: "node-exporter" # 任务名称 file_sd_configs: # 启用 file_sd - files: - "/etc/prometheus/targets/*.json" # 目标文件路径(支持通配符) - "/etc/prometheus/targets/mysql-exporters/*.json" # MySQL 监控 refresh_interval: 5m # 重新加载间隔(默认 5m)
|
node exporter textfile
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
#!/bin/bash
# Capture a short traffic sample with tcpdump, aggregate per-IP byte counts
# with tshark + awk, and publish the result for node_exporter's textfile
# collector. Intended to run from cron.

DURATION=15       # capture length in seconds (keep shorter than the cron interval)
INTERFACE="eth0"
OUTPUT_FILE="/tmp/traffic.pcap"
METRICS_FILE="/etc/node-exporter/textfile-collector/network_traffic.prom"

# Install dependencies if missing
if ! command -v tcpdump &>/dev/null || ! command -v tshark &>/dev/null; then
  echo "Installing dependencies: tcpdump and tshark..."
  sudo apt-get update && sudo apt-get install -y tcpdump tshark
fi

sudo mkdir -p "$(dirname "$METRICS_FILE")"

# Capture traffic
sudo timeout "$DURATION" tcpdump -i "$INTERFACE" -w "$OUTPUT_FILE" >/dev/null 2>&1

# Generate Prometheus-format metrics.
# The temp file is written via `sudo tee`: the collector directory is normally
# root-owned, so a plain (non-sudo) shell redirection would fail there.
sudo tshark -r "$OUTPUT_FILE" -T fields -e ip.src -e ip.dst -e frame.len 2>/dev/null \
  | awk '
    {
      bytes[$1] += $3       # bytes attributed to the source IP
      bytes[$2] += $3       # bytes attributed to the destination IP
      total_bytes += $3
    }
    END {
      # total_bytes+0 guarantees a numeric value even for an empty capture
      # (an empty value would be rejected by the textfile collector).
      print "network_traffic_total_bytes " total_bytes + 0
      for (ip in bytes) {
        if (ip != "") {     # skip non-IP frames (empty field)
          printf "network_traffic_bytes{ip=\"%s\"} %d\n", ip, bytes[ip]
        }
      }
    }' | sudo tee "$METRICS_FILE.$$" >/dev/null

# Atomic rename so node_exporter never reads a half-written file
sudo mv "$METRICS_FILE.$$" "$METRICS_FILE"

# Cleanup
sudo rm -f "$OUTPUT_FILE"
./node_exporter --web.listen-address=":900" --collector.textfile.directory=/etc/node-exporter/textfile-collector/
|
blackbox_exporter
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
modules:
  # Plain HTTP probe: succeeds only on a 200 response, forced over IPv4.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      valid_http_versions: ["HTTP/1.1", "HTTP/2"]
      valid_status_codes: [200]
      method: GET
      preferred_ip_protocol: "ip4"

  # TCP probe that validates the server greets with an OpenSSH banner.
  ssh_banner_check:
    prober: tcp
    timeout: 10s
    tcp:
      query_response:
        - expect: "^SSH-2.0-OpenSSH"
          send: "SSH-2.0-blackbox-ssh-check"
      preferred_ip_protocol: "ip4"
scrape_configs:
  - job_name: 'blackbox-http'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://qq.com
          - https://google.com
    # Shared relabel rules (YAML anchor), reused by the SSH job below:
    # target -> ?target= param -> instance label, then point the scrape
    # at the blackbox-exporter itself.
    relabel_configs: &common_relabel
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  - job_name: 'blackbox-ssh'
    metrics_path: /probe
    params:
      module: [ssh_banner_check]
    static_configs:
      - targets:
          - '127.0.0.1:22'
          - '10.0.1.122:22'
    relabel_configs: *common_relabel
Prometheus 抓取任务生成 │ ├─ 原始目标: example.com:80 │ │ │ ├─ relabel 规则1: 将地址赋值给 __param_target → ?target=example.com:80 │ ├─ relabel 规则2: 用 __param_target 标记 instance → instance="example.com:80" │ └─ relabel 规则3: 重写地址 → 实际请求发送到 Blackbox Exporter │ │ │ └─ Blackbox Exporter 收到请求,解析参数后探测 example.com:80 │ └─ 原始目标: google.com:443 └─ 同理生成请求 http://192.168.100.100:9115/probe?target=google.com:443&module=http_2xx
# Probe failed entirely (target down or unreachable)
probe_success{job="blackbox-http"} == 0
# HTTP status outside the 2xx range
probe_http_status_code{job="blackbox-http"} < 200 or probe_http_status_code{job="blackbox-http"} >= 300
# TLS certificate expires within 30 days (86400 s * 30)
probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 86400 * 30
# TCP probe (e.g. SSH banner check) failed
probe_success{job="blackbox-tcp"} == 0
# Probe took longer than 1 second end-to-end
probe_duration_seconds{job="blackbox-http"} > 1
# DNS resolution time component of the probe (for latency drill-down)
probe_dns_lookup_time_seconds
|
process-exporter
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
| {{.Comm}} 包含原始可执行文件的基本名称,即 /proc/<pid>/stat 中的第 2 个字段,并截取前15个字符 {{.ExeBase}} 包含可执行文件的基本名称 {{.ExeFull}} 包含可执行文件的完全限定路径 {{.Username}} 包含有效用户的用户名 {{.Matches}} map 包含应用 cmdline 正则表达式产生的所有匹配项 {{.PID}} 包含进程的 PID。请注意,使用 PID 意味着该组将仅包含一个进程 {{.StartTime}} 包含进程的开始时间。这与 PID 结合使用时非常有用,因为 PID 会随着时间的推移而被重用。 {{.Cgroups}} 包含(如果支持)进程的 cgroups (/proc/self/cgroup)。这对于识别进程属于哪个容器特别有用
process_names: - name: "{{.Comm}}" cmdline: - 'docker*' - name: "{{.Matches}}" cmdline: - 'nginx*' - name: "{{.ExeFull}}" cmdline: - 'mysql*'
# A watched process group has no live processes (ignoring process-exporter itself)
namedprocess_namegroup_num_procs{groupname!~".*process-exporter.*"} == 0
# Zombie processes present in any group
namedprocess_namegroup_states{state="Z"} > 0
# nginx is not running
namedprocess_namegroup_num_procs{groupname="nginx"} == 0

# CPU usage (%) of the java group over the last 5 minutes
100 * rate(namedprocess_namegroup_cpu_seconds_total{groupname="java"}[5m])
# Any group exceeding 50% CPU
100 * rate(namedprocess_namegroup_cpu_seconds_total{}[5m]) > 50

# Resident memory of the java group, in MiB
namedprocess_namegroup_memory_bytes{groupname="java"} / 1024^2
# Total host memory in MiB, for comparison against the per-group figure
node_memory_MemTotal_bytes / 1024 / 1024

# MySQL group disk read / write throughput, MiB/s
rate(namedprocess_namegroup_read_bytes_total{groupname="mysql"}[5m]) / 1024^2
rate(namedprocess_namegroup_write_bytes_total{groupname="mysql"}[5m]) / 1024^2
|
prometheus联邦
- 数量较多的情况下从多个下级 Prometheus 实例中提取特定指标,汇总到中心 Prometheus
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
| Central Prometheus ↑ 从多个下级拉取聚合后的指标 | +---------------+---------------+ | | | Region A Region B Region C Prometheus Prometheus Prometheus
scrape_configs: - job_name: 'federate-regions' scrape_interval: 1m honor_labels: true metrics_path: '/federate' params: 'match[]': - '{job="api-server"}' - '{__name__=~"job:.*"}' - 'up{instance=~".+"}' static_configs: - targets: - 'prometheus-region-a:9090' - 'prometheus-region-b:9090' - 'prometheus-region-c:9090'
match[] 过滤条件 精确匹配:'{job="mysql"}' 拉取所有 job=mysql 的指标。 正则匹配:'__name__=~"http_request_.+"' 拉取以 http_request_ 开头的指标。 组合条件:'{env="prod", app=~"web|api"}' 拉取 prod 环境下 web 或 api 应用的指标。
|