node-exporter

Overview

The Node Mixin is a set of configurable, reusable, and extensible alerts and dashboards based on the metrics exported by the Node Exporter. The mixin creates recording and alerting rules for Prometheus and suitable dashboard descriptions for Grafana.

Alerts

告警（Alerts）规则配置列表，详见源文件。

node-exporter

NodeFilesystemSpaceFillingUp

# Warning tier: less than 40% of space is available, a linear extrapolation of
# the last 6h predicts zero available bytes within 24h, and the filesystem is
# not read-only. The condition must hold for 1h.
alert: NodeFilesystemSpaceFillingUp
annotations:
  description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
  summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
  (
    node_filesystem_avail_bytes{job="node_exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node_exporter",fstype!="",mountpoint!=""} * 100 < 40
  and
    predict_linear(node_filesystem_avail_bytes{job="node_exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
  and
    node_filesystem_readonly{job="node_exporter",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: warning
NodeFilesystemSpaceFillingUp

# Critical tier: less than 20% of space is available, a linear extrapolation of
# the last 6h predicts zero available bytes within 4h, and the filesystem is
# not read-only. The condition must hold for 1h.
alert: NodeFilesystemSpaceFillingUp
annotations:
  description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
  summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
  (
    node_filesystem_avail_bytes{job="node_exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node_exporter",fstype!="",mountpoint!=""} * 100 < 20
  and
    predict_linear(node_filesystem_avail_bytes{job="node_exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
  and
    node_filesystem_readonly{job="node_exporter",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: critical
NodeFilesystemAlmostOutOfSpace

# Warning tier: less than 5% of space is available on a filesystem that is not
# read-only. The condition must hold for 30m.
alert: NodeFilesystemAlmostOutOfSpace
annotations:
  description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
  summary: Filesystem has less than 5% space left.
expr: |
  (
    node_filesystem_avail_bytes{job="node_exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node_exporter",fstype!="",mountpoint!=""} * 100 < 5
  and
    node_filesystem_readonly{job="node_exporter",fstype!="",mountpoint!=""} == 0
  )
for: 30m
labels:
  severity: warning
NodeFilesystemAlmostOutOfSpace

# Critical tier: less than 3% of space is available on a filesystem that is not
# read-only. The condition must hold for 30m.
alert: NodeFilesystemAlmostOutOfSpace
annotations:
  description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
  summary: Filesystem has less than 3% space left.
expr: |
  (
    node_filesystem_avail_bytes{job="node_exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node_exporter",fstype!="",mountpoint!=""} * 100 < 3
  and
    node_filesystem_readonly{job="node_exporter",fstype!="",mountpoint!=""} == 0
  )
for: 30m
labels:
  severity: critical
NodeFilesystemFilesFillingUp

# Warning tier: less than 40% of inodes are free, a linear extrapolation of the
# last 6h predicts zero free inodes within 24h, and the filesystem is not
# read-only. The condition must hold for 1h.
alert: NodeFilesystemFilesFillingUp
annotations:
  description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
  summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
  (
    node_filesystem_files_free{job="node_exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node_exporter",fstype!="",mountpoint!=""} * 100 < 40
  and
    predict_linear(node_filesystem_files_free{job="node_exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
  and
    node_filesystem_readonly{job="node_exporter",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: warning
NodeFilesystemFilesFillingUp

# Critical tier: less than 20% of inodes are free, a linear extrapolation of
# the last 6h predicts zero free inodes within 4h, and the filesystem is not
# read-only. The condition must hold for 1h.
alert: NodeFilesystemFilesFillingUp
annotations:
  description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
  summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
  (
    node_filesystem_files_free{job="node_exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node_exporter",fstype!="",mountpoint!=""} * 100 < 20
  and
    predict_linear(node_filesystem_files_free{job="node_exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
  and
    node_filesystem_readonly{job="node_exporter",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: critical
NodeFilesystemAlmostOutOfFiles

# Warning tier: less than 5% of inodes are free on a filesystem that is not
# read-only. The condition must hold for 1h.
alert: NodeFilesystemAlmostOutOfFiles
annotations:
  description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
  summary: Filesystem has less than 5% inodes left.
expr: |
  (
    node_filesystem_files_free{job="node_exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node_exporter",fstype!="",mountpoint!=""} * 100 < 5
  and
    node_filesystem_readonly{job="node_exporter",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: warning
NodeFilesystemAlmostOutOfFiles

# Critical tier: less than 3% of inodes are free on a filesystem that is not
# read-only. The condition must hold for 1h.
alert: NodeFilesystemAlmostOutOfFiles
annotations:
  description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
  summary: Filesystem has less than 3% inodes left.
expr: |
  (
    node_filesystem_files_free{job="node_exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node_exporter",fstype!="",mountpoint!=""} * 100 < 3
  and
    node_filesystem_readonly{job="node_exporter",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: critical
NodeNetworkReceiveErrs

# Fires when receive errors exceed 1% of received packets (2m rates) for 1h.
# $value is the error-to-packet ratio, not an error count, so the description
# renders it as a percentage.
alert: NodeNetworkReceiveErrs
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.'
  summary: Network interface is reporting many receive errors.
expr: |
  rate(node_network_receive_errs_total{job="node_exporter"}[2m]) / rate(node_network_receive_packets_total{job="node_exporter"}[2m]) > 0.01
for: 1h
labels:
  severity: warning
NodeNetworkTransmitErrs

# Fires when transmit errors exceed 1% of transmitted packets (2m rates) for
# 1h. $value is the error-to-packet ratio, not an error count, so the
# description renders it as a percentage.
alert: NodeNetworkTransmitErrs
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.'
  summary: Network interface is reporting many transmit errors.
expr: |
  rate(node_network_transmit_errs_total{job="node_exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node_exporter"}[2m]) > 0.01
for: 1h
labels:
  severity: warning
NodeHighNumberConntrackEntriesUsed

# Fires as soon as conntrack entries exceed 75% of the configured limit (there
# is no `for:` hold period). Both operands carry the same job selector; the
# division already matches on identical label sets, so this is result-neutral.
alert: NodeHighNumberConntrackEntriesUsed
annotations:
  description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
  summary: Number of conntrack are getting close to the limit.
expr: |
  (node_nf_conntrack_entries{job="node_exporter"} / node_nf_conntrack_entries_limit{job="node_exporter"}) > 0.75
labels:
  severity: warning
NodeTextFileCollectorScrapeError

# Fires as soon as node_textfile_scrape_error equals 1 (no `for:` hold period).
alert: NodeTextFileCollectorScrapeError
annotations:
  description: Node Exporter text file collector on {{ $labels.instance }} failed to scrape.
  summary: Node Exporter text file collector failed to scrape.
expr: |
  node_textfile_scrape_error{job="node_exporter"} == 1
labels:
  severity: warning
NodeClockSkewDetected

# Fires when the clock offset is beyond +/-0.05s and its 5m derivative shows it
# is not moving back toward zero (non-negative slope for a positive offset,
# non-positive slope for a negative one). The condition must hold for 10m.
alert: NodeClockSkewDetected
annotations:
  description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
  summary: Clock skew detected.
expr: |
  (
    node_timex_offset_seconds{job="node_exporter"} > 0.05
  and
    deriv(node_timex_offset_seconds{job="node_exporter"}[5m]) >= 0
  )
  or
  (
    node_timex_offset_seconds{job="node_exporter"} < -0.05
  and
    deriv(node_timex_offset_seconds{job="node_exporter"}[5m]) <= 0
  )
for: 10m
labels:
  severity: warning
NodeClockNotSynchronising

# Fires when the sync status dropped to 0 at some point in the last 5m and the
# reported maximum error is at least 16s. The condition must hold for 10m.
alert: NodeClockNotSynchronising
annotations:
  description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
  summary: Clock not synchronising.
expr: |
  min_over_time(node_timex_sync_status{job="node_exporter"}[5m]) == 0
  and
  node_timex_maxerror_seconds{job="node_exporter"} >= 16
for: 10m
labels:
  severity: warning
NodeRAIDDegraded

# Fires when an MD array requires more disks than it has in the "active" state.
# The `state` label is ignored so the two series can be matched. The condition
# must hold for 15m.
alert: NodeRAIDDegraded
annotations:
  description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
  summary: RAID Array is degraded.
expr: |
  node_md_disks_required{job="node_exporter",device!=""} - ignoring (state) (node_md_disks{state="active",job="node_exporter",device!=""}) > 0
for: 15m
labels:
  severity: critical
NodeRAIDDiskFailure

# Fires as soon as an MD array reports at least one disk in the "failed" state
# (no `for:` hold period).
alert: NodeRAIDDiskFailure
annotations:
  description: At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
  summary: Failed device in RAID array.
expr: |
  node_md_disks{state="failed",job="node_exporter",device!=""} > 0
labels:
  severity: warning
NodeFileDescriptorLimit

# Warning tier: allocated file descriptors exceed 70% of the maximum for 15m.
alert: NodeFileDescriptorLimit
annotations:
  description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
  summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
  (
    node_filefd_allocated{job="node_exporter"} * 100 / node_filefd_maximum{job="node_exporter"} > 70
  )
for: 15m
labels:
  severity: warning
NodeFileDescriptorLimit

# Critical tier: allocated file descriptors exceed 90% of the maximum for 15m.
alert: NodeFileDescriptorLimit
annotations:
  description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
  summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
  (
    node_filefd_allocated{job="node_exporter"} * 100 / node_filefd_maximum{job="node_exporter"} > 90
  )
for: 15m
labels:
  severity: critical
NodeCPUHighUsage

# Fires when non-idle CPU time (2m rate, averaged across CPUs and summed over
# modes) exceeds 90% for 15m. Informational severity only.
alert: NodeCPUHighUsage
annotations:
  description: |
    CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
  summary: High CPU usage.
expr: |
  sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node_exporter", mode!="idle"}[2m]))) * 100 > 90
for: 15m
labels:
  severity: info
NodeSystemSaturation

# Fires when the 1-minute load divided by the number of CPUs (counted from the
# idle-mode CPU series) exceeds 2 for 15m.
alert: NodeSystemSaturation
annotations:
  description: |
    System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
    This might indicate this instance resources saturation and can cause it becoming unresponsive.
  summary: System saturated, load per core is very high.
expr: |
  node_load1{job="node_exporter"}
  / count without (cpu, mode) (node_cpu_seconds_total{job="node_exporter", mode="idle"}) > 2
for: 15m
labels:
  severity: warning
NodeMemoryMajorPagesFaults

# Fires when the 5m rate of major page faults exceeds 500 per second for 15m.
alert: NodeMemoryMajorPagesFaults
annotations:
  description: |
    Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
    Please check that there is enough memory available at this instance.
  summary: Memory major page faults are occurring at very high rate.
expr: |
  rate(node_vmstat_pgmajfault{job="node_exporter"}[5m]) > 500
for: 15m
labels:
  severity: warning
NodeMemoryHighUtilization

# Fires when memory utilisation (100 minus the available/total percentage)
# exceeds 90% for 15m.
alert: NodeMemoryHighUtilization
annotations:
  description: |
    Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
  summary: Host is running out of memory.
expr: |
  100 - (node_memory_MemAvailable_bytes{job="node_exporter"} / node_memory_MemTotal_bytes{job="node_exporter"} * 100) > 90
for: 15m
labels:
  severity: warning
NodeDiskIOSaturation

# Fires when the 5m rate of weighted disk I/O time exceeds 10 for 30m.
alert: NodeDiskIOSaturation
annotations:
  description: |
    Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
    This symptom might indicate disk saturation.
  summary: Disk IO queue is high.
expr: |
  rate(node_disk_io_time_weighted_seconds_total{job="node_exporter", device!=""}[5m]) > 10
for: 30m
labels:
  severity: warning
NodeSystemdServiceFailed

# Fires when a systemd unit has its state="failed" series equal to 1 for 5m.
alert: NodeSystemdServiceFailed
annotations:
  description: Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}
  summary: Systemd service has entered failed state.
expr: |
  node_systemd_unit_state{job="node_exporter", state="failed"} == 1
for: 5m
labels:
  severity: warning
NodeBondingDegraded

# Fires when the configured and active slave counts of a bonding interface
# differ for 5m. Unlike the other alerts, neither metric has a job selector.
alert: NodeBondingDegraded
annotations:
  description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in degraded state due to one or more slave failures.
  summary: Bonding interface is degraded
expr: |
  (node_bonding_slaves - node_bonding_active) != 0
for: 5m
labels:
  severity: warning

Recording Rules

指标计算（Recording）规则配置列表，详见源文件。

node-exporter.rules

instance:node_num_cpu:sum

# Counts the idle-mode CPU series after dropping the cpu and mode labels,
# giving one series per remaining label set.
expr: |
  count without (cpu, mode) (
    node_cpu_seconds_total{job="node_exporter",mode="idle"}
  )
record: instance:node_num_cpu:sum
instance:node_cpu_utilisation:rate5m

# One minus the 5m rate of idle, iowait and steal time, summed over those modes
# and averaged across CPUs.
expr: |
  1 - avg without (cpu) (
    sum without (mode) (rate(node_cpu_seconds_total{job="node_exporter", mode=~"idle|iowait|steal"}[5m]))
  )
record: instance:node_cpu_utilisation:rate5m
instance:node_load1_per_cpu:ratio

# 1-minute load divided by the recorded CPU count (instance:node_num_cpu:sum).
expr: |
  (
    node_load1{job="node_exporter"}
  /
    instance:node_num_cpu:sum{job="node_exporter"}
  )
record: instance:node_load1_per_cpu:ratio
instance:node_memory_utilisation:ratio

# One minus available memory over total memory. Where MemAvailable has no
# series, the `or` branch substitutes Buffers + Cached + MemFree + Slab.
expr: |
  1 - (
    (
      node_memory_MemAvailable_bytes{job="node_exporter"}
      or
      (
        node_memory_Buffers_bytes{job="node_exporter"}
        +
        node_memory_Cached_bytes{job="node_exporter"}
        +
        node_memory_MemFree_bytes{job="node_exporter"}
        +
        node_memory_Slab_bytes{job="node_exporter"}
      )
    )
  /
    node_memory_MemTotal_bytes{job="node_exporter"}
  )
record: instance:node_memory_utilisation:ratio
instance:node_vmstat_pgmajfault:rate5m

# 5m per-second rate of major page faults.
expr: |
  rate(node_vmstat_pgmajfault{job="node_exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
instance_device:node_disk_io_time_seconds:rate5m

# 5m rate of disk I/O time, per device (series with an empty device label are
# excluded).
expr: |
  rate(node_disk_io_time_seconds_total{job="node_exporter", device!=""}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
instance_device:node_disk_io_time_weighted_seconds:rate5m

# 5m rate of weighted disk I/O time, per device (series with an empty device
# label are excluded).
expr: |
  rate(node_disk_io_time_weighted_seconds_total{job="node_exporter", device!=""}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
instance:node_network_receive_bytes_excluding_lo:rate5m

# 5m rate of received bytes summed over every device except "lo".
expr: |
  sum without (device) (
    rate(node_network_receive_bytes_total{job="node_exporter", device!="lo"}[5m])
  )
record: instance:node_network_receive_bytes_excluding_lo:rate5m
instance:node_network_transmit_bytes_excluding_lo:rate5m

# 5m rate of transmitted bytes summed over every device except "lo".
expr: |
  sum without (device) (
    rate(node_network_transmit_bytes_total{job="node_exporter", device!="lo"}[5m])
  )
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
instance:node_network_receive_drop_excluding_lo:rate5m

# 5m rate of dropped received packets summed over every device except "lo".
expr: |
  sum without (device) (
    rate(node_network_receive_drop_total{job="node_exporter", device!="lo"}[5m])
  )
record: instance:node_network_receive_drop_excluding_lo:rate5m
instance:node_network_transmit_drop_excluding_lo:rate5m

# 5m rate of dropped transmitted packets summed over every device except "lo".
expr: |
  sum without (device) (
    rate(node_network_transmit_drop_total{job="node_exporter", device!="lo"}[5m])
  )
record: instance:node_network_transmit_drop_excluding_lo:rate5m

Dashboards

仪表盘配置文件下载地址: