More robust

This commit is contained in:
2025-10-05 16:03:51 +02:00
parent 2f8f75dfa1
commit 211d4442e5
5 changed files with 186 additions and 38 deletions

View File

@@ -57,7 +57,7 @@ tail -f /var/log/hdd_temp_monitor.log
|-----------------------|-----------------------------------------------------------------|-----------------------------------------------|
| `MAX_TEMP` | Maximum allowed temperature (°C) before starting shutdown count | `60` |
| `HOT_DURATION` | Consecutive minutes above `MAX_TEMP` before shutdown | `5` |
| `COOL_DURATION` | Consecutive minutes below `MAX_TEMP` required to reset counter | `5` |
| `COOL_RESET_DURATION` | Consecutive minutes below `MAX_TEMP` to reset all counters | `5` |
| `LOG_FILE` | Path to the main log file | `/var/log/hdd_temp_monitor.log` |
| `LOG_ROTATE_COUNT` | Number of log files to keep | `7` |
| `LOG_ROTATE_PERIOD` | Rotation period for logs (`daily` or `weekly`) | `daily` |

View File

@@ -1,36 +1,74 @@
#!/bin/bash
# HotDisk: Monitor SATA disk temperature and notify via Discord
set -euo pipefail
CONF_FILE="/etc/hdd_temp_monitor.conf"
STATE_FILE="/tmp/hdd_temp_state.txt"
STATE_FILE="/tmp/hdd_temp_state"
# Check if configuration file exists
if [[ ! -f "$CONF_FILE" ]]; then
echo "ERROR: Configuration file $CONF_FILE not found!" >&2
exit 1
fi
source "$CONF_FILE"
# Validate required variables
for var in MAX_TEMP HOT_DURATION COOL_RESET_DURATION LOG_FILE DISCORD_WEBHOOK; do
if [[ -z "${!var:-}" ]]; then
echo "ERROR: Required variable $var not set in $CONF_FILE" >&2
exit 1
fi
done
# Collect the names of whole-disk block devices, leaving out NVMe devices.
# The NVMe filter is done inside awk rather than with a trailing
# `grep -v '^nvme'`: grep exits 1 when no line survives the filter, and with
# `set -euo pipefail` in effect that would abort the script on this
# assignment, so the "no disks" branch below could never run.
# A genuine lsblk failure still fails the pipeline and stops the script.
DISKS=$(lsblk -dno NAME,TYPE | awk '$2=="disk" && $1 !~ /^nvme/ {print $1}')
if [[ -z "$DISKS" ]]; then
  # Nothing to monitor is not an error: report it and finish successfully.
  echo "WARNING: No SATA disks found to monitor" >&2
  exit 0
fi
if [ ! -f "$STATE_FILE" ]; then touch "$STATE_FILE"; fi
declare -A HOT_COUNTERS
declare -A COOL_COUNTERS
while read -r line; do
disk=$(echo "$line" | cut -d= -f1)
val=$(echo "$line" | cut -d= -f2)
HOT_COUNTERS[$disk]=$val
[[ -z "$line" || "$line" =~ ^# ]] && continue
if [[ "$line" =~ ^(.+)_HOT=(.+)$ ]]; then
HOT_COUNTERS[${BASH_REMATCH[1]}]=${BASH_REMATCH[2]}
elif [[ "$line" =~ ^(.+)_COOL=(.+)$ ]]; then
COOL_COUNTERS[${BASH_REMATCH[1]}]=${BASH_REMATCH[2]}
fi
done < "$STATE_FILE"
for disk in $DISKS; do
temp=$(smartctl -A /dev/$disk | awk '/Temperature_Celsius/ {print $10; exit}')
[ -z "$temp" ] && continue
# Get temperature with error handling
if ! temp=$(smartctl -A /dev/$disk 2>/dev/null | awk '/Temperature_Celsius/ {print $10; exit}'); then
echo "WARNING: Failed to read temperature for $disk" >&2
continue
fi
# Skip if temperature is empty or not numeric
if [[ -z "$temp" ]] || ! [[ "$temp" =~ ^[0-9]+$ ]]; then
continue
fi
hot=${HOT_COUNTERS[$disk]:-0}
cool=${COOL_COUNTERS[$disk]:-0}
if [ "$temp" -ge "$MAX_TEMP" ]; then
hot=$((hot+1))
cool=0
curl -s -X POST -H "Content-Type: application/json" -d "{\"content\":\"🔥 Warning: $disk is above $MAX_TEMP°C for $hot minute(s)\"}" "$DISCORD_WEBHOOK"
if ! curl -s -X POST -H "Content-Type: application/json" -d "{\"content\":\"🔥 Warning: $disk is above $MAX_TEMP°C for $hot minute(s)\"}" "$DISCORD_WEBHOOK" >/dev/null 2>&1; then
echo "WARNING: Failed to send Discord notification for $disk" >&2
fi
if [ "$hot" -ge "$HOT_DURATION" ]; then
curl -s -X POST -H "Content-Type: application/json" -d "{\"content\":\"⚠️ Critical: $disk has been above $MAX_TEMP°C for $HOT_DURATION minutes. Shutting down...\"}" "$DISCORD_WEBHOOK"
if ! curl -s -X POST -H "Content-Type: application/json" -d "{\"content\":\"⚠️ Critical: $disk has been above $MAX_TEMP°C for $HOT_DURATION minutes. Shutting down...\"}" "$DISCORD_WEBHOOK" >/dev/null 2>&1; then
echo "WARNING: Failed to send critical Discord notification for $disk" >&2
fi
sleep 5
shutdown -h now
fi
else
if [ "$hot" -gt 0 ]; then
cool=$((cool+1))
curl -s -X POST -H "Content-Type: application/json" -d "{\"content\":\"❄️ Notice: $disk is under $MAX_TEMP°C for $cool minute(s)\"}" "$DISCORD_WEBHOOK"
if [ "$cool" -ge "$COOL_DURATION" ]; then
if ! curl -s -X POST -H "Content-Type: application/json" -d "{\"content\":\"❄️ Notice: $disk is under $MAX_TEMP°C for $cool minute(s)\"}" "$DISCORD_WEBHOOK" >/dev/null 2>&1; then
echo "WARNING: Failed to send cool-down Discord notification for $disk" >&2
fi
if [ "$cool" -ge "$COOL_RESET_DURATION" ]; then
hot=0
cool=0
fi
@@ -38,9 +76,31 @@ for disk in $DISKS; do
fi
HOT_COUNTERS[$disk]=$hot
COOL_COUNTERS[$disk]=$cool
echo "$(date '+%Y-%m-%d %H:%M:%S') $disk $temp°C" >> "$LOG_FILE"
done
> "$STATE_FILE"
for disk in "${!HOT_COUNTERS[@]}"; do
echo "$disk=${HOT_COUNTERS[$disk]}" >> "$STATE_FILE"
# Ensure log directory exists and log the temperature
LOG_DIR=$(dirname "$LOG_FILE")
if [[ ! -d "$LOG_DIR" ]]; then
mkdir -p "$LOG_DIR" 2>/dev/null || echo "WARNING: Cannot create log directory $LOG_DIR" >&2
fi
echo "$(date '+%Y-%m-%d %H:%M:%S') $disk $temp°C" >> "$LOG_FILE" 2>/dev/null || echo "WARNING: Cannot write to log file $LOG_FILE" >&2
done
# Persist the per-disk HOT/COOL counters for the next run.
# The data is written to a temporary file next to STATE_FILE and then renamed
# over it, so a reader never sees a half-written state file.
# mktemp is used instead of a "<state>.tmp.<pid>" name: the PID-based name is
# guessable, and STATE_FILE lives in a world-writable directory, so a
# pre-planted file or symlink could otherwise be written through.
# Note: mktemp creates the file with owner-only permissions.
if ! TEMP_STATE_FILE=$(mktemp "${STATE_FILE}.tmp.XXXXXX"); then
  echo "ERROR: Cannot create temporary state file next to $STATE_FILE" >&2
  exit 1
fi
{
  # One "<disk>_HOT=<n>" line per disk, then one "<disk>_COOL=<n>" line per
  # disk; this is the format the state loader at the top of the script parses.
  for disk in "${!HOT_COUNTERS[@]}"; do
    echo "${disk}_HOT=${HOT_COUNTERS[$disk]}"
  done
  for disk in "${!COOL_COUNTERS[@]}"; do
    echo "${disk}_COOL=${COOL_COUNTERS[$disk]}"
  done
} > "$TEMP_STATE_FILE"
# A rename within one directory replaces the target in a single step.
if ! mv -- "$TEMP_STATE_FILE" "$STATE_FILE" 2>/dev/null; then
  echo "WARNING: Failed to update state file atomically" >&2
  # Best-effort cleanup so failed runs do not leave temp files behind.
  rm -f -- "$TEMP_STATE_FILE" 2>/dev/null || true
fi

View File

@@ -1,11 +1,24 @@
#!/bin/bash
set -euo pipefail
# run_as_root CMD [ARG...]
# Runs CMD with its arguments passed through unchanged. When the effective
# UID is not 0 the command is launched through sudo; otherwise it is run
# directly. The exit status is that of the launched command.
run_as_root() {
  if (( EUID != 0 )); then
    sudo "$@"
    return
  fi
  "$@"
}
BASE_URL="https://git.djeex.fr/Djeex/hotdisk/raw/branch/main/sh"
SCRIPTS=("hotdisk.sh" "hotdisk_logger.sh" "install_hotdisk.sh")
sudo apt update
sudo apt install -y smartmontools curl
sudo mkdir -p /usr/local/bin
run_as_root mkdir -p /usr/local/bin
for script in "${SCRIPTS[@]}"; do
sudo curl -fsSL "$BASE_URL/$script" -o "/usr/local/bin/$script"
sudo chmod +x "/usr/local/bin/$script"
echo "Downloading $script..."
if ! run_as_root curl -fsSL "$BASE_URL/$script" -o "/usr/local/bin/$script"; then
echo "ERROR: Failed to download $script" >&2
exit 1
fi
run_as_root chmod +x "/usr/local/bin/$script"
done
sudo /usr/local/bin/install_hotdisk.sh
run_as_root /usr/local/bin/install_hotdisk.sh

View File

@@ -1,8 +1,32 @@
#!/bin/bash
set -euo pipefail
# run_as_root CMD [ARG...]
# Runs CMD with its arguments passed through unchanged. When the effective
# UID is not 0 the command is launched through sudo; otherwise it is run
# directly. The exit status is that of the launched command.
run_as_root() {
  if (( EUID != 0 )); then
    sudo "$@"
    return
  fi
  "$@"
}
CONF_FILE="/etc/hdd_temp_monitor.conf"
if [[ ! -f "$CONF_FILE" ]]; then
echo "ERROR: Configuration file $CONF_FILE not found!" >&2
exit 1
fi
source "$CONF_FILE"
# Validate required variables
for var in LOG_FILE LOG_ROTATE_PERIOD LOG_ROTATE_COUNT; do
if [[ -z "${!var:-}" ]]; then
echo "ERROR: Required variable $var not set in $CONF_FILE" >&2
exit 1
fi
done
LOGROTATE_FILE="/etc/logrotate.d/hotdisk"
sudo tee "$LOGROTATE_FILE" > /dev/null <<EOF
run_as_root tee "$LOGROTATE_FILE" > /dev/null <<EOF
$LOG_FILE {
$LOG_ROTATE_PERIOD
rotate $LOG_ROTATE_COUNT

View File

@@ -1,13 +1,32 @@
#!/bin/bash
set -euo pipefail
# run_as_root CMD [ARG...]
# Runs CMD with its arguments passed through unchanged. When the effective
# UID is not 0 the command is launched through sudo; otherwise it is run
# directly. The exit status is that of the launched command.
run_as_root() {
  if (( EUID != 0 )); then
    sudo "$@"
    return
  fi
  "$@"
}
CONFIG_FILE=/etc/hdd_temp_monitor.conf
SERVICE_FILE=/etc/systemd/system/hotdisk.service
TIMER_FILE=/etc/systemd/system/hotdisk.timer
echo "=== HotDisk Installation ==="
DEPENDENCIES=(bash smartctl curl lsblk awk date tee sudo systemctl)
DEPENDENCIES=(bash smartctl curl lsblk awk date tee systemctl)
MISSING=()
# Check dependencies - skip sudo if running as root
for cmd in "${DEPENDENCIES[@]}"; do
if ! command -v "$cmd" >/dev/null 2>&1; then MISSING+=("$cmd"); fi
done
# Only check for sudo if not running as root
if [[ $EUID -ne 0 ]] && ! command -v sudo >/dev/null 2>&1; then
MISSING+=("sudo")
fi
if [ ${#MISSING[@]} -ne 0 ]; then
echo "❌ Missing dependencies:"
for cmd in "${MISSING[@]}"; do echo " - $cmd"; done
@@ -17,49 +36,81 @@ fi
echo "✅ All dependencies are installed."
read -p "Maximum temperature (°C) before shutdown [60]: " MAX_TEMP
MAX_TEMP=${MAX_TEMP:-60}
if ! [[ "$MAX_TEMP" =~ ^[0-9]+$ ]] || [[ $MAX_TEMP -lt 1 || $MAX_TEMP -gt 100 ]]; then
echo "ERROR: MAX_TEMP must be a number between 1-100" >&2
exit 1
fi
read -p "Consecutive minutes above MAX_TEMP before shutdown [5]: " HOT_DURATION
HOT_DURATION=${HOT_DURATION:-5}
read -p "Consecutive minutes below MAX_TEMP to reset counter [5]: " COOL_DURATION
COOL_DURATION=${COOL_DURATION:-5}
if ! [[ "$HOT_DURATION" =~ ^[0-9]+$ ]] || [[ $HOT_DURATION -lt 1 ]]; then
echo "ERROR: HOT_DURATION must be a positive number" >&2
exit 1
fi
read -p "Minutes below MAX_TEMP to reset all counters [5]: " COOL_RESET_DURATION
COOL_RESET_DURATION=${COOL_RESET_DURATION:-5}
if ! [[ "$COOL_RESET_DURATION" =~ ^[0-9]+$ ]] || [[ $COOL_RESET_DURATION -lt 1 ]]; then
echo "ERROR: COOL_RESET_DURATION must be a positive number" >&2
exit 1
fi
read -p "Log file path [/var/log/hdd_temp_monitor.log]: " LOG_FILE
LOG_FILE=${LOG_FILE:-/var/log/hdd_temp_monitor.log}
read -p "Logrotate: number of files to keep [7]: " LOG_ROTATE_COUNT
LOG_ROTATE_COUNT=${LOG_ROTATE_COUNT:-7}
if ! [[ "$LOG_ROTATE_COUNT" =~ ^[0-9]+$ ]] || [[ $LOG_ROTATE_COUNT -lt 1 ]]; then
echo "ERROR: LOG_ROTATE_COUNT must be a positive number" >&2
exit 1
fi
read -p "Logrotate: rotation period (daily/weekly) [daily]: " LOG_ROTATE_PERIOD
LOG_ROTATE_PERIOD=${LOG_ROTATE_PERIOD:-daily}
if [[ ! "$LOG_ROTATE_PERIOD" =~ ^(daily|weekly)$ ]]; then
echo "ERROR: LOG_ROTATE_PERIOD must be 'daily' or 'weekly'" >&2
exit 1
fi
echo "Paste your Discord Webhook URL here."
read -p "Discord Webhook URL: " DISCORD_WEBHOOK
[ -z "$DISCORD_WEBHOOK" ] && { echo "Discord Webhook cannot be empty"; exit 1; }
if [[ -z "$DISCORD_WEBHOOK" ]]; then
echo "ERROR: Discord Webhook cannot be empty" >&2
exit 1
fi
# Validate Discord webhook URL format
if [[ ! "$DISCORD_WEBHOOK" =~ ^https://discord(app)?\.com/api/webhooks/ ]]; then
echo "ERROR: Invalid Discord webhook URL format" >&2
exit 1
fi
echo ""
echo "Please confirm:"
echo "MAX_TEMP=$MAX_TEMP"
echo "HOT_DURATION=$HOT_DURATION"
echo "COOL_DURATION=$COOL_DURATION"
echo "COOL_RESET_DURATION=$COOL_RESET_DURATION"
echo "LOG_FILE=$LOG_FILE"
echo "LOG_ROTATE_COUNT=$LOG_ROTATE_COUNT"
echo "LOG_ROTATE_PERIOD=$LOG_ROTATE_PERIOD"
echo "DISCORD_WEBHOOK=$DISCORD_WEBHOOK"
read -p "Is this correct? (y/n): " CONFIRM
[[ ! "$CONFIRM" =~ ^[Yy]$ ]] && { echo "Aborted"; exit 1; }
sudo tee "$CONFIG_FILE" > /dev/null <<EOF
run_as_root tee "$CONFIG_FILE" > /dev/null <<EOF
MAX_TEMP=$MAX_TEMP
HOT_DURATION=$HOT_DURATION
COOL_DURATION=$COOL_DURATION
COOL_RESET_DURATION=$COOL_RESET_DURATION
LOG_FILE=$LOG_FILE
LOG_ROTATE_COUNT=$LOG_ROTATE_COUNT
LOG_ROTATE_PERIOD=$LOG_ROTATE_PERIOD
DISCORD_WEBHOOK=$DISCORD_WEBHOOK
EOF
sudo chmod +x /usr/local/bin/sh/hotdisk.sh /usr/local/bin/sh/hotdisk_logger.sh
sudo /usr/local/bin/sh/hotdisk_logger.sh
sudo tee "$SERVICE_FILE" > /dev/null <<EOF
run_as_root chmod +x /usr/local/bin/hotdisk.sh /usr/local/bin/hotdisk_logger.sh
run_as_root /usr/local/bin/hotdisk_logger.sh
run_as_root tee "$SERVICE_FILE" > /dev/null <<EOF
[Unit]
Description=HotDisk SATA Temperature Check
[Service]
Type=oneshot
ExecStart=/usr/local/bin/sh/hotdisk.sh
ExecStart=/usr/local/bin/hotdisk.sh
EOF
sudo tee "$TIMER_FILE" > /dev/null <<EOF
run_as_root tee "$TIMER_FILE" > /dev/null <<EOF
[Unit]
Description=Run HotDisk temperature check every minute
[Timer]
@@ -69,7 +120,7 @@ Persistent=true
[Install]
WantedBy=timers.target
EOF
sudo systemctl daemon-reload
sudo systemctl enable --now hotdisk.timer
sudo /usr/local/bin/sh/hotdisk.sh
run_as_root systemctl daemon-reload
run_as_root systemctl enable --now hotdisk.timer
run_as_root /usr/local/bin/hotdisk.sh
echo "✅ HotDisk installation complete!"