* Monitor CPU, board, and NVMe temperatures
* Monitor NVMe SMART health (wear, errors, status)
* Avoid unreliable ACPI thermal zones
—
apt update
apt install lm-sensors smartmontools nvme-cli
—
Path:
/usr/lib/check_mk_agent/plugins/sensors
Content:
#!/bin/bash
# Checkmk agent plugin: print the lm-sensors readings as the "sensors"
# agent section.
#
# The section header is only emitted when the sensors binary is present and
# executable, so a host without lm-sensors does not produce a header that is
# followed by nothing but a shell error.
if [ -x /usr/bin/sensors ]; then
  echo "<<<sensors>>>"
  /usr/bin/sensors
fi
Set permissions:
chmod +x /usr/lib/check_mk_agent/plugins/sensors
—
Path:
/usr/lib/check_mk_agent/local/500_nvme_smart
Content:
#!/bin/bash
# CheckMK local check for NVMe SMART via smartctl
# Version: 0.1
# Scope: character devices matching /dev/nvme[0-9]* (block devices that match
#        the glob are skipped by the -c test below).
#
# For every matching device two local-check lines are written to stdout:
#   "NVMe SMART <disk>"        - health, wear and error counters
#   "NVMe Temperature <disk>"  - temperature in degrees Celsius
# Devices for which smartctl prints nothing are silently skipped.

# Temperature thresholds (degrees Celsius) and wear thresholds (percent used).
TEMP_WARN=70
TEMP_CRIT=75
USED_WARN=80
USED_CRIT=90

# Print the value of the first line in the smartctl output ($1) that starts
# with "<label>:" ($2). Everything after that first colon is returned with
# surrounding blanks removed; nothing is printed when no line matches.
smart_field() {
  printf '%s\n' "$1" | awk -v label="$2" '
    index($0, label ":") == 1 {
      value = substr($0, length(label) + 2)
      gsub(/^[ \t]+|[ \t]+$/, "", value)
      print value
      exit
    }'
}

# Turn a raw SMART value ($1) into a plain non-negative integer by dropping
# "%", thousands separators and blanks. When the result is empty or still
# contains a non-digit, the fallback ($2) is printed instead, so the integer
# comparisons further down never see a non-numeric operand.
smart_int() {
  local cleaned="${1//[%, ]/}"
  case "$cleaned" in
    '' | *[!0-9]*) printf '%s\n' "$2" ;;
    *) printf '%s\n' "$cleaned" ;;
  esac
}

# Evaluate one device and print its two local-check lines.
# Arguments: $1 - device name used in the service names (e.g. nvme0)
#            $2 - complete "smartctl -a" output for that device
report_nvme_device() {
  local disk="$1" out="$2"
  local model health critical_warning temp used spare spare_thr
  local media_err err_log unsafe power_hours state tstate

  model="$(smart_field "$out" 'Model Number')"
  health="$(smart_field "$out" 'SMART overall-health self-assessment test result')"
  critical_warning="$(smart_field "$out" 'Critical Warning')"
  model="${model:-unknown}"
  health="${health:-UNKNOWN}"
  critical_warning="${critical_warning:-UNKNOWN}"

  # The temperature line reads "<n> Celsius"; only the first word is kept.
  temp="$(smart_field "$out" 'Temperature')"
  temp="$(smart_int "${temp%% *}" 0)"

  used="$(smart_int "$(smart_field "$out" 'Percentage Used')" 0)"
  spare="$(smart_int "$(smart_field "$out" 'Available Spare')" 0)"
  spare_thr="$(smart_int "$(smart_field "$out" 'Available Spare Threshold')" 10)"
  media_err="$(smart_int "$(smart_field "$out" 'Media and Data Integrity Errors')" 0)"
  err_log="$(smart_int "$(smart_field "$out" 'Error Information Log Entries')" 0)"
  unsafe="$(smart_int "$(smart_field "$out" 'Unsafe Shutdowns')" 0)"
  power_hours="$(smart_int "$(smart_field "$out" 'Power On Hours')" 0)"

  # First matching condition wins; only the wear level can yield state 1,
  # every other finding is reported as state 2.
  state=0
  if [ "$health" != "PASSED" ]; then
    state=2
  elif [ "$critical_warning" != "0x00" ]; then
    state=2
  elif [ "$media_err" -gt 0 ]; then
    state=2
  elif [ "$spare" -le "$spare_thr" ]; then
    state=2
  elif [ "$used" -ge "$USED_CRIT" ]; then
    state=2
  elif [ "$used" -ge "$USED_WARN" ]; then
    state=1
  fi

  printf '%s\n' "$state \"NVMe SMART $disk\" used=${used};${USED_WARN};${USED_CRIT}|spare=${spare};;;|media_errors=${media_err};;;|error_log_entries=${err_log};;;|unsafe_shutdowns=${unsafe};;;|power_on_hours=${power_hours};;; $model - health=${health}, critical_warning=${critical_warning}, used=${used}%, spare=${spare}%, media_errors=${media_err}, errlog=${err_log}, unsafe_shutdowns=${unsafe}, power_on_hours=${power_hours}"

  tstate=0
  if [ "$temp" -ge "$TEMP_CRIT" ]; then
    tstate=2
  elif [ "$temp" -ge "$TEMP_WARN" ]; then
    tstate=1
  fi

  printf '%s\n' "$tstate \"NVMe Temperature $disk\" temp=${temp};${TEMP_WARN};${TEMP_CRIT} $model - ${temp}C"
}

# Without smartctl there is nothing to report; print nothing in that case.
if command -v smartctl >/dev/null 2>&1; then
  for dev in /dev/nvme[0-9]*; do
    # Also covers the no-match case, where the glob stays a literal string.
    [ -c "$dev" ] || continue

    # C locale keeps the number formatting predictable for the parsing above.
    # stderr is dropped on purpose: a device smartctl cannot describe at all
    # yields empty output and is skipped.
    out="$(LC_ALL=C smartctl -a "$dev" 2>/dev/null)"
    [ -n "$out" ] || continue

    report_nvme_device "${dev##*/}" "$out"
  done
fi
—
check_mk_agent | grep -A5 NVMe
Expected output:
NVMe SMART nvme0
NVMe Temperature nvme0
—
* Run Service Discovery
* Accept the following services:
—
NVMe temperature thresholds:
* WARN: 70°C
* CRIT: 75°C
CPU:
* Use default CheckMK thresholds
—
* NVMe in compact systems (e.g. NUC) tends to run warmer due to limited airflow
* Temperatures up to ~65°C under load are typical
* Avoid sustained temperatures above 70°C (thermal throttling possible)
—