#!/bin/bash
# set -x

#=========================================================
# our configuration
#=========================================================
# Identity, alert address and paths of this monitored host. They are set
# before functions.moshel is sourced below; presumably that library reads
# them - TODO confirm against functions.moshel.

MYNAME=alpha
WEBURL="https://alpha.wyae.de/moshel"
ALERTTO='root_alpha@wyae.de'
WWWDIR=/home/www/alpha/moshel
CMPDIR=/usr/local/lib/moshel/Compare

# keep empty if you want checks to run, otherwise state short maintenance message
MAINTENANCE=""
# MAINTENANCE="Techniker ist informiert"

# Quoted so that a path containing blanks or glob characters stays one word.
mkdir -p -- "$CMPDIR"
DATADIR="$WWWDIR/data/$MYNAME"
mkdir -p -- "$DATADIR"

#=========================================================
# Checks are named as "SHOULD BE" - so alert if exceeded
#=========================================================
# Argument layout of the Check* calls as used throughout this file:
#   <check name> <command> <awk-style filter> <threshold> <alert message>

. /usr/local/lib/moshel/functions.moshel

Category "HardDiscs"
#------------------------
# Free space of the root filesystem, once in MB and once converted to GB.
CheckValueOver __MB_HDD_free 'df -m /' '/\/dev\//{ print $4 }' 20000 'Platte voll'
CheckValueOver __GB_HDD_free 'df -m /' '/\/dev\//{ print $4 / 1024 }' 20 'Platte voll'

# the following checks need the "smartmontools" to be installed
# Problem: smartctl keeps discs spinning?!
# SSD-only
CheckValueUnder sda_SSD_WearLevelingCount 'smartctl -A /dev/sda' '/Wear_Leveling_Count/{ print $10 }' 95 'SSD fast worn down'
CheckValueUnder sda_SSD_ReservedBlocks 'smartctl -A /dev/sda' '/Used_Rsvd_Blk_Cnt_To/{ print $10 }' 100 'SSD-Reserve fast verbraucht'
CheckValueUnder sda_SSD_ProgramFails 'smartctl -A /dev/sda' '/Program_Fail_Cnt_Total/{ print $10 }' 95 'SSD zu viele Program Fails'
CheckValueUnder sda_SSD_EraseFails 'smartctl -A /dev/sda' '/Erase_Fail_Count_Total/{ print $10 }' 95 'SSD mit Schwierigkeiten beim Loeschen'
CheckValueUnder sda_SSD_UncorrectableErrors 'smartctl -A /dev/sda' '/Uncorrectable_Error_Cnt/{ print $10 }' 95 'SSD mit zu vielen Fehlern'
# SSD+HDD
CheckValueUnder sda_Temperatur 'smartctl -A /dev/sda' '/194 Temperature_Celsius/{ print $10 }' 40 'Festplatte zu heiss'
# HDD-only
CheckValueUnder sdb_HDD_ReallocatedSectors 'smartctl -A /dev/sdb' '/Reallocated_Sector_Ct/{ print $10 }' 1 'Festplatte mit fehlerhaften Sektoren'
CheckValueUnder sdb_HDD_PendingSector 'smartctl -A /dev/sdb' '/197 Current_Pending_Sector/{ print $10 }' 1 'Festplatte mit unbeschreibbaren Sektoren'
CheckValueUnder sdb_HDD_OfflineUncorrectable 'smartctl -A /dev/sdb' '/198 Offline_Uncorrectable/{ print $10 }' 1 'Festplatte mit zu vielen Fehlern'

Category "System Ressources"
#------------------------
# vmstat is sampled twice, 5 s apart; the END rule reads the last output line.
CheckValueOver CPU_idle 'vmstat 5 2' 'END {print $15}' 20 'CPU ausgelastet'
CheckValueUnder CPU_IO-wait 'vmstat 5 2' 'END {print $16}' 20 'CPU wartet - IO ausgelastet'
CheckValueUnder Load 'cat /proc/loadavg' '{ print $1 }' 1 'System-Load zu hoch'
CheckValueUnder MB_RAM_used 'free -m' '/Mem:/{ print $3 }' 1000 'RAM voll'
CheckValueOver GB_RAM_free 'cat /proc/meminfo' '/MemAvailable:/{ print $2 / 1000000 }' 3 'RAM voll'
CheckCountLessThan Prozesse 'ps ax' '{print $0 }' 140 'zu viele Prozesse'
CheckCountLessThan ZombieProzesse 'ps -A' '/defunct/{print $0 }' 1 'zu viele Zombie-Prozesse'
# CPU/Board temperature (the raw value is divided by 1000 before comparison)
CheckValueUnder Temperatur 'head -n 1 /sys/class/thermal/thermal_zone0/temp' '{ t = $0 / 1000 } END{ printf "%.1f", t }' 60 'Es wird zu heiss!'
# APC UPS
CheckValueOver USVtimeleft 'apcaccess -p TIMELEFT' '{ print $1 }' 2 'USV-Battery with only little time left'
CheckValueOver USVbatteryloaded 'apcaccess -p BCHARGE' '{ print $1 }' 66 'USV-Battery insufficiently charged'

Category "Services Listening"
#------------------------
# Each check counts the lines of 'ss -tln' that contain the given port.
CheckCountMoreThan DovecotSieve_listening 'ss -tln' '/:4190 /{print $0 }' 0 'Dovecot ManageSieve not listening/running'
CheckCountMoreThan DovecotIMAPS_listening 'ss -tln' '/:993 /{print $0 }' 0 'Dovecot IMAPS not listening/running'
# IMAP is port 143; this check used to probe :443 and thereby duplicated the
# HTTPS check further down instead of watching Dovecot.
CheckCountMoreThan DovecotIMAP_listening 'ss -tln' '/:143 /{print $0 }' 0 'Dovecot IMAP not listening/running'
# The three Postfix ports carry distinct check names now; previously all three
# were called PostfixSMTP_listening. NOTE(review): presumably results are keyed
# by check name, so identical names would collide - confirm in functions.moshel.
CheckCountMoreThan PostfixSMTP_listening 'ss -tln' '/:25 /{print $0 }' 0 'Postfix SMTP not listening/running'
CheckCountMoreThan PostfixSMTPS_listening 'ss -tln' '/:465 /{print $0 }' 0 'Postfix SMTPS not listening/running'
CheckCountMoreThan PostfixSubmit_listening 'ss -tln' '/:587 /{print $0 }' 0 'Postfix Submit not listening/running'
CheckCountMoreThan HTTPS_listening 'ss -tln' '/:443 /{print $0 }' 0 'HTTPS server not listening/running'
CheckCountMoreThan HTTP_listening 'ss -tln' '/:80 /{print $0 }' 0 'HTTP server not listening/running'
CheckCountMoreThan DNS_listening 'ss -tln' '/:53 /{print $0 }' 0 'DNS server not listening/running'
CheckCountMoreThan MariaDB_listening 'ss -tln' '/:3306 /{print $0 }' 0 'MariaDB/MySQL server not listening/running'

Category "MySQL / MariaDB checks"
#------------------------
CheckValueUnder MySQL_Threads 'mysqladmin --user=status --password=status status' '{ print $4 }' 20 'Ziemlich viele MySQL-Threads'
# in older versions the value is at position 22
# please verify the position by running
# "mysqladmin --user=status --password=status status" and counting the fields
CheckValueUnder MySQL_Queries 'mysqladmin --user=status --password=status status' '{ print $19 }' 5 'Ziemlich viele gleichzeitige MySQL-Querys'

Category "Mail checks"
#------------------------
# Mail queue size plus Postfix/Dovecot activity taken from the systemd journal
# of the last five minutes. The two journal commands are spelled out once and
# handed to every check as the very same command string.
_moshel_postfix_journal='journalctl -u postfix@-.service --since "5 minute ago"'
_moshel_dovecot_journal='journalctl -u dovecot --since "5 minute ago"'

CheckCountLessThan MailQueue 'mailq' '{print $0}' 20 'Mails stauen sich gerade auf unserem Mailer'

# Postfix
CheckCountLessThan PostfixTLS_Incoming "$_moshel_postfix_journal" '/TLS connection established from /{print $0}' 50 'Viele TLS-Verbindungsversuche zu unserem Mailer'
CheckCountLessThan PostfixTLS_Outgoing "$_moshel_postfix_journal" '/TLS connection established to /{print $0}' 20 'Viele ausgehende TLS-Verbindungsversuche auf unserem Mailer'
CheckCountLessThan PostfixIncoming "$_moshel_postfix_journal" '/ connect from /{print $0}' 500 'Viele eingehende Mailversuche'
CheckCountLessThan PostfixNOQUEUE "$_moshel_postfix_journal" '/ NOQUEUE: reject: /{print $0}' 500 'Ziemlich viel abzulehnen gerade'
CheckCountLessThan PostfixSent "$_moshel_postfix_journal" '/ status=sent /{print $0}' 500 'Ziemlich viel ausgehende Mails'

# Dovecot
CheckCountLessThan DovecotStored "$_moshel_dovecot_journal" '/ mail saved to /{print $0}' 20 'Ziemlich viele abgelegte Mails'
CheckCountLessThan DovecotSieved "$_moshel_dovecot_journal" '/ sieve: /{print $0}' 20 'Ziemlich viele gefilterte Mails'
CheckCountLessThan DovecotLoginFailed "$_moshel_dovecot_journal" '/ imap-login: Disconnected \(auth failed, /{print $0}' 10 'Dovecot Bruteforce-Versuch!'

# The helper variables are only needed for the calls above.
unset _moshel_postfix_journal _moshel_dovecot_journal
Category "DNS+RBL checks"
#------------------------
# you are on an RBL if Reversed IP + RBL-URL yield 127.0.0.*
# here mail.wyae.de = 90.187.34.181
#
# Host name and address are stated once; the reversed form needed for the RBL
# lookups and the dot-escaped form needed for the awk filters are derived.
MAILHOST=mail.wyae.de
MAILIP=90.187.34.181

# Print the dotted-quad IPv4 address given as $1 with its octets reversed
# (a.b.c.d -> d.c.b.a).
_moshel_reverse_ipv4() {
  local o1 o2 o3 o4
  IFS=. read -r o1 o2 o3 o4 <<<"$1"
  printf '%s.%s.%s.%s\n' "$o4" "$o3" "$o2" "$o1"
}

MAILIP_REVERSED=$(_moshel_reverse_ipv4 "$MAILIP")
# Escape the dots: an unescaped '.' in an awk regex matches ANY character.
MAILIP_REGEX=${MAILIP//./\\.}

rbl_zones=(
  zen.spamhaus.org
  cbl.abuseat.org
  virbl.dnsbl.bit.nl
  dnsbl.inps.de
  ix.dnsbl.manitu.net
  no-more-funn.moensted.dk
  combined.njabl.org
  dnsbl.njabl.org
  bl.spamcannibal.org
  bl.spamcop.net
  dnsbl-1.uceprotect.net
  dsn.rfc-ignorant.org
  postmaster.rfc-ignorant.org
  bogusmx.rfc-ignorant.org
)
# NOTE(review): some of these zones may no longer be operated - verify the
# list before relying on the results.
for RBL in "${rbl_zones[@]}"; do
  # Counts answer lines that start with 127.0. (dots matched literally).
  CheckCountLessThan "RBL_$RBL" "dig +short ${MAILIP_REVERSED}.$RBL" '/^127\.0\./{print $0}' 1 "Mailserver steht auf RBL $RBL"
done

# The mail host must resolve to MAILIP at the default resolver and at two
# public resolvers.
CheckCountMoreThan "DNS_${MAILHOST}_Provider" "dig +short $MAILHOST" "/^${MAILIP_REGEX}/{print \$0}" 0 "Mailserver DNS beim Provider unbekannt"
CheckCountMoreThan "DNS_${MAILHOST}_Cloudflare" "dig +short $MAILHOST @1.1.1.1" "/^${MAILIP_REGEX}/{print \$0}" 0 "Mailserver DNS bei Cloudflare unbekannt"
CheckCountMoreThan "DNS_${MAILHOST}_Google" "dig +short $MAILHOST @8.8.8.8" "/^${MAILIP_REGEX}/{print \$0}" 0 "Mailserver DNS bei Google unbekannt"

Category "Network checks"
#------------------------
# With FS="/" the fifth field of ping's "rtt min/avg/max/mdev" line is the
# average round-trip time.
CheckValueUnder PingGoogle 'ping -n -c 3 -w 5 8.8.8.8' 'BEGIN {FS="/"}; /rtt min\/avg\/max\/mdev/{ print $5}' 30 'Ping dauert zu lang'
CheckCountLessThan NetworkConnections 'ss -tn' '/ESTAB/{print $0}' 50 'Ziemlich viele offene TCP-Verbindungen'
# SERVER is a placeholder - replace it with the host name of your web server.
CheckCountMoreThan PHP_running 'curl -s --retry 1 http://SERVER/phpcheck.php' '/php REALLY works/{print $0 }' 0 'PHP kaputt? Lighttpd bitte neu starten.'
CheckCountLessThan TLS-Certificate_SERVER 'curl -I --retry 1 https://SERVER/mosshecheck.txt --stderr -' '/SSL certificate problem/{print $0 }' 1 'TLS Zertifikat abgelaufen!'

Category "IoCs"
#------------------------
# Indicators of compromise: any interactive session counts as an alert.
CheckCountLessThan InteraktivesPTS 'ps ax' '/pts\//{print $0 }' 1 'Nutzer mit interaktivem Login'
CheckCountLessThan User 'w' '/t/{print $0 }' 1 'Interaktive Nutzer auf dem System!'
# run "./generate_compares.sh" in /usr/local/lib/moshel directory
# Every watched file is registered under its own base name.
for _moshel_watched in \
  /etc/resolv.conf \
  /etc/passwd \
  /etc/shadow \
  /root/.ssh/authorized_keys
do
  CheckFileChanges "${_moshel_watched##*/}" "$_moshel_watched"
done
unset _moshel_watched

Category "QNAP NAS checks via SNMP"
# change SNMPCOMMUNITY to your SNMP community string, and replace IPADDRESS with your NAS' IP address
#------------------------
### ideas from https://github.com/nikband/check_qnap3.sh/blob/master/check_qnap3.sh

# Shared command prefix and the OID subtree that all volume queries below use;
# ".6.<n>" is queried for the status string, ".5.<n>" for the free-space value.
_moshel_snmp='snmpget -v1 -c SNMPCOMMUNITY IPADDRESS'
_moshel_vol='1.3.6.1.4.1.24681.1.2.17.1'

# volume 1
CheckCountMoreThan NASvolume_1_status "${_moshel_snmp} ${_moshel_vol}.6.1" '/STRING: "Ready"/' 0 'Volume offline'
CheckValueOver NASvolume_1_free_GB "${_moshel_snmp} ${_moshel_vol}.5.1" '{print substr($4,2,9)}' 50 'Volume voll'

# volume 2
CheckCountMoreThan NASvolume_2_status "${_moshel_snmp} ${_moshel_vol}.6.2" '/STRING: "Ready"/' 0 'Volume offline'
#CheckCountMoreThan NASvolume_2_typ_TB "${_moshel_snmp} ${_moshel_vol}.5.2" '/ TB"/{print $5}' 0 "Volume hat nur noch GByte frei, nicht mehr TB!"
CheckValueOver NASvolume_2_free_GB "${_moshel_snmp} ${_moshel_vol}.5.2" '{print substr($4,2,9)}' 500 'Volume voll'

# volume 3
CheckCountMoreThan NASvolume_3_status "${_moshel_snmp} ${_moshel_vol}.6.3" '/STRING: "Ready"/' 0 'Volume offline'
CheckCountMoreThan NASvolume_3_typ_TB "${_moshel_snmp} ${_moshel_vol}.5.3" '/ TB"/{print $5}' 0 "Volume hat nur noch GByte frei, nicht mehr TB!"

unset _moshel_snmp _moshel_vol
# Command prefix shared by the NAS queries below.
# change SNMPCOMMUNITY to your SNMP community string, and replace IPADDRESS with your NAS' IP address
_moshel_nas_snmp='snmpget -v1 -c SNMPCOMMUNITY IPADDRESS'

#CheckValueOver NASvolume_3_free_GB "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.2.17.1.5.3" '{print substr($4,2,9)}' 500 'Volume voll'
CheckValueUnder NAS_CPU_usage "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.2.1.0" '{print substr($4,2,9)}' 90 'CPU ausgelastet'
CheckValueOver NAS_RAM_free "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.2.2.0" '{print substr($4,2,9)}' 200 'RAM voll'
CheckValueUnder NAS_Temp_usage "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.2.6.0" '{print substr($4,2,4)}' 50 'System zu heiss'
# The fan speed is read from one OID and checked against an upper and a lower bound.
CheckValueUnder NAS_Fan_RPM_high "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.2.15.1.3.1" '{print substr($4,2,4)}' 1000 'Luefter dreht zu hoch'
CheckValueOver NAS_Fan_RPM_low "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.2.15.1.3.1" '{print substr($4,2,4)}' 500 'Luefter dreht zu niedrig'
CheckCountMoreThan NAS_HDD1_status "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.4.1" '/STRING: "Good"/{print $0}' 0 'HDD offline'
CheckValueUnder NAS_HDD1_Temp "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.6.1" '{print $4}' 50 'HDD zu heiss'
CheckCountMoreThan NAS_HDD2_status "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.4.2" '/STRING: "Good"/{print $0}' 0 'HDD offline'
CheckValueUnder NAS_HDD2_Temp "${_moshel_nas_snmp} 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.6.2" '{print $4}' 50 'HDD zu heiss'
unset _moshel_nas_snmp

Category "Ambient Sensors"
#------------------------
# most sensors basically never should trigger, but are used to collect data
#
# Each reading is written to the temporary file $FN first; the check that
# follows then reads it back with "cat $FN".
FN=$( mktemp )

# generate a log file with a service running
#   rtl_433 -C si -F csv:/var/log/rtl_433.log
# grep -F replaces the obsolescent fgrep (same fixed-string matching).
grep -F -- 'MATCH_SENSOR_STRING' /var/log/rtl_433.log \
  | tail -n 1 | cut -d ',' -f 9 > "$FN"
# The filter must be '{ print $0 }': a bare '$0' is a truth test in awk and
# prints nothing for a reading of exactly 0 - the very value this check is about.
CheckValueOver AirTemperature "cat $FN" '{ print $0 }' 1 'Danger to the garden! Freezing temperatures'

# "alert" check only to log data
grep -F -- 'MATCH_SENSOR_STRING' /var/log/rtl_433.log \
  | tail -n 1 | cut -d ',' -f 13 > "$FN"
CheckValueUnder AirHumidity "cat $FN" '{ print $0 }' 100 'Sensor broken?'

# retrieve power usage/solar generation from an OpenDTU
# Only ONE of the two variants may be active: both write to $FN, so a second
# call simply overwrites the first and the first request is wasted.
#-- *old* OpenDTU v24.1.26 (enable this INSTEAD of the call below on old firmware)
# curl --retry-max-time 9 --retry 2 -s http://IPADDRESS/api/livedata/status | jq '.inverters[0].AC."0".Power.v' | cut -d '.' -f 1 > "$FN"
#-- after OpenDTU v24.2.12
curl --retry-max-time 9 --retry 2 -s http://IPADDRESS/api/livedata/status | jq '.total.Power.v' | cut -d '.' -f 1 > "$FN"
CheckValueUnder Solarstrom-OpenDTU "cat $FN" '{ print $0 }' 800 'Solar generation above 800W'

# retrieve power usage/solar generation from a Shelly (Gen1-API)
curl --retry-max-time 9 --retry 2 -s -u USER:PASSWORD --basic http://IPADDRESS/status | jq '.meters[0].power' > "$FN"
CheckValueUnder Solarstrom-Shelly1pm "cat $FN" '{ print $0 }' 800 'Solar generation above 800W'

# retrieve power usage/solar generation from a ShellyPlugS (Gen2-API)
curl --retry-max-time 9 --retry 2 -s -u USER:PASSWORD --anyauth 'http://IPADDRESS/rpc/Switch.GetStatus?id=0' | jq '.apower' > "$FN"
CheckValueUnder Solarstrom-ShellyPlugS "cat $FN" '{ print $0 }' 800 'Solar generation above 800W'

rm -f -- "$FN"

#=========================================================
# Summary
#=========================================================

# CURLAUTH=" --digest --user USERNAME:PASSWORD"
Centralize https://SERVER1/moshel http://SERVER22/moshel http://SERVER333/moshel

CleanUp

# restart the httpd if a PHP check fails (for the first time)
ActionOnAlert PHP_running 'systemctl restart lighttpd'

#############################################################################
# MoSheL: remote server monitoring environment
#
# Copyright (C) 2020- Volker Tanger
#
# Licensed under the EUROPEAN UNION PUBLIC LICENCE v. 1.2 (or later)
# This is free software - see attached file LICENSE_EUPL-1.2_EN.txt
# and available in other languages under
# https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12
#
# For bug reports and suggestions or if you just want to talk to me please
# contact me at volker.tanger@wyae.de
#
# Updates will be available at https://www.wyae.de/software/moshel/
# please check there for updates prior to submitting patches!
#
# For list of changes please refer to the HISTORY file. Thanks.
#############################################################################