1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
# Append the "node-exporter" alerting-rule group to prometheus/alert.yml.
#
# The heredoc delimiter is quoted ("EOF") so the shell performs no expansion
# inside the body: the `$labels` / `$value` template variables must reach the
# rule file literally.
#
# NOTE(review): the group entry below starts at column 0, which assumes the
# existing `groups:` list items in prometheus/alert.yml are written at the same
# indent -- confirm against the target file before running.
#
# Fixes relative to the previous revision:
#   * `$labels.deveice` -> `$labels.device` in the three network alerts; the
#     misspelled label name does not match the `device` label used in those
#     expressions, so the NIC name could not be rendered.
#   * Message typos: "内容可用率" -> "内存可用率", "indoes" -> "inodes",
#     "内存压力不足" -> "内存压力过大" (the old summary contradicted the
#     description of the same alert).
#
# Alert names are intentionally left unchanged so that anything keyed on them
# keeps matching.
# NOTE(review): `HostCpuIsUnderUtilized` fires when CPU usage is ABOVE 80%, and
# `HostunusualDiskWriteRate` / `HostOutofDiskSpace` / `HostConnttrackLimit`
# carry casing/spelling slips -- consider renaming in a coordinated change.
# NOTE(review): the network error alerts and the conntrack alert compare a
# ratio, while their descriptions present `$value` as a count.
cat >> prometheus/alert.yml <<"EOF"
- name: node-exporter
  rules:
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "主机内存不足,实例:{{ $labels.instance}}"
      description: "内存可用率<10%,当前值:{{ $value}}"
  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "内存压力过大,实例:{{ $labels.instance}}"
      description: "节点内存压力大。重大页面错误率高,当前值为: {{ $value}}"
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 /1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常流入网络吞吐量,实例:{{ $labels.instance}}"
      description: "网络流入流量 > 100MB/s,当前值:{{ $value}}"
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常流出网络吞吐量,实例:{{ $labels.instance}}"
      description: "网络流出流量 > 100MB/s,当前值:{{ $value}}"
  - alert: HostUnusualDiskReadRate
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 /1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常磁盘读取,实例:{{ $labels.instance}}"
      description: "磁盘读取> 50MB/s,当前值: {{ $value}}"
  - alert: HostunusualDiskWriteRate
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常磁盘写入,实例:{{ $labels.instance}}"
      description: "磁盘写入 > 50MB/s,当前值:{{ $value}}"
  - alert: HostOutofDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device,mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "磁盘空间不足告警,实例:{{ $labels.instance}}"
      description: "剩余磁盘空间 < 10%, 当前值:{{ $value}}"
  - alert: HostDiskWillFillIn24Hours
    expr: (node_filesystem_avail_bytes * 100 ) / node_filesystem_size_bytes < 10 and ON (instance,device,mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance,device,mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "磁盘空间将在24小时内耗尽,实例: {{ $labels.instance}}"
      description: "以当前写入速率预计磁盘空间将在24小时耗尽,当前值: {{ $value}}"
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint = "/"} * 100 < 10 and ON (instance,device,mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "磁盘Inodes不足,实例:{{ $labels.instance}}"
      description: "剩余磁盘 inodes < 10%, 当前值: {{ $value}}"
  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常磁盘读取延迟,实例:{{ $labels.instance}}"
      description: "磁盘读取延迟 > 100ms, 当前值: {{ $value}}"
  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常磁盘写入延迟,实例:{{ $labels.instance}}"
      description: "磁盘写入延迟 > 100ms, 当前值: {{ $value}}"
  - alert: high_load
    expr: node_load1 > 4
    for: 2m
    labels:
      severity: page
    annotations:
      summary: "CPU1分钟负载过高,实例:{{ $labels.instance}}"
      description: "CPU1分钟负载 > 4, 当前值: {{ $value}}"
  - alert: HostCpuIsUnderUtilized
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100 ) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "CPU负载过高,实例:{{ $labels.instance}}"
      description: "CPU负载 > 80%, 当前值: {{ $value}}"
  - alert: HostCpuStealNoisyNeighbor
    expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "CPU窃取率异常,实例:{{ $labels.instance}}"
      description: "CPU窃取率 > 10%, 嘈杂的邻居正在扼杀 VM 性能,或者 Spot 实例可能失去信用,当前值: {{ $value}}"
  - alert: HostSwapIsFillingUp
    expr: (1- (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "磁盘swap空间使用率异常,实例:{{ $labels.instance}}"
      description: "磁盘swap空间使用率 > 80%, 当前值: {{ $value}}"
  - alert: HostNetworkReceiveErrors
    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常网络接收错误,实例:{{ $labels.instance}}"
      description: "网卡{{ $labels.device}} 在过去2分钟接收 {{ $value}} 个错误"
  - alert: HostNetworkTransmitErrors
    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常网络传输错误,实例:{{ $labels.instance}}"
      description: "网卡{{ $labels.device}} 在过去2分钟传输 {{ $value}} 个错误"
  - alert: HostNetworkInterfaceSaturated
    expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "异常网络接口饱和,实例:{{ $labels.instance}}"
      description: "网卡{{ $labels.device}}正在超载,当前值 {{ $value}}"
  - alert: HostConnttrackLimit
    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "异常连接数,实例:{{ $labels.instance}}"
      description: "连接数过大,当前连接数: {{ $value}}"
  - alert: HostClockSkew
    expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0 ) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "异常时钟偏差,实例:{{ $labels.instance}}"
      description: "检测到时钟偏差,时钟不同步。值为: {{ $value}}"
  - alert: HostClockNotSynchronising
    expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "时钟不同步,实例:{{ $labels.instance}}"
      description: "时钟不同步。"
EOF
|