#!/bin/bash
# MoSheL per-host configuration: sets this host's identity and paths, then
# (further down) lists the checks to run.
# set -x   # uncomment to trace every command while debugging

#=========================================================
# our configuration
#=========================================================
# NOTE(review): these variables are not used in this file itself; presumably
# they are read by the sourced function library - confirm there.
MYNAME=alpha
WEBURL="https://alpha.wyae.de/moshel"
ALERTTO='root_alpha@wyae.de'
WWWDIR=/home/www/alpha/moshel
CMPDIR=/usr/local/lib/moshel/Compare

# keep empty if you want checks to run, otherwise state short maintenance message
MAINTENANCE=""
# MAINTENANCE="Techniker ist informiert"

# Create the working directories if they do not exist yet. Expansions are
# quoted so that a path containing blanks or glob characters stays one word.
mkdir -p -- "$CMPDIR"
DATADIR="$WWWDIR/data/$MYNAME"
mkdir -p -- "$DATADIR"
#=========================================================
# Checks are named as "SHOULD BE" - so alert if exceeded
#=========================================================
# Pull in the helper library; Category and the Check* commands used below are
# not defined in this file, so they presumably come from here.
. /usr/local/lib/moshel/functions.moshel

# Every check below is called with five arguments which, judging by their
# use here, are: check name, command to run, awk program applied to the
# command's output, threshold, alert text.
Category "HardDiscs"
#------------------------
# Free space on the root filesystem, once in MB and once converted to GB.
CheckValueOver __MB_HDD_free \
  'df -m /' \
  '/\/dev\//{ print $4 }' \
  20000 'Platte voll'
CheckValueOver __GB_HDD_free \
  'df -m /' \
  '/\/dev\//{ print $4 / 1024 }' \
  20 'Platte voll'

# The following checks need the "smartmontools" package to be installed.
# Open question from the author: does smartctl keep the discs spinning?

# SSD-only attributes
CheckValueUnder sda_SSD_WearLevelingCount \
  'smartctl -A /dev/sda' \
  '/Wear_Leveling_Count/{ print $10 }' \
  95 'SSD fast worn down'
CheckValueUnder sda_SSD_ReservedBlocks \
  'smartctl -A /dev/sda' \
  '/Used_Rsvd_Blk_Cnt_To/{ print $10 }' \
  100 'SSD-Reserve fast verbraucht'
CheckValueUnder sda_SSD_ProgramFails \
  'smartctl -A /dev/sda' \
  '/Program_Fail_Cnt_Total/{ print $10 }' \
  95 'SSD zu viele Program Fails'
CheckValueUnder sda_SSD_EraseFails \
  'smartctl -A /dev/sda' \
  '/Erase_Fail_Count_Total/{ print $10 }' \
  95 'SSD mit Schwierigkeiten beim Loeschen'
CheckValueUnder sda_SSD_UncorrectableErrors \
  'smartctl -A /dev/sda' \
  '/Uncorrectable_Error_Cnt/{ print $10 }' \
  95 'SSD mit zu vielen Fehlern'

# Attribute available on SSDs and HDDs alike
CheckValueUnder sda_Temperatur \
  'smartctl -A /dev/sda' \
  '/194 Temperature_Celsius/{ print $10 }' \
  40 'Festplatte zu heiss'

# HDD-only attributes
CheckValueUnder sdb_HDD_ReallocatedSectors \
  'smartctl -A /dev/sdb' \
  '/Reallocated_Sector_Ct/{ print $10 }' \
  1 'Festplatte mit fehlerhaften Sektoren'
CheckValueUnder sdb_HDD_PendingSector \
  'smartctl -A /dev/sdb' \
  '/197 Current_Pending_Sector/{ print $10 }' \
  1 'Festplatte mit unbeschreibbaren Sektoren'
CheckValueUnder sdb_HDD_OfflineUncorrectable \
  'smartctl -A /dev/sdb' \
  '/198 Offline_Uncorrectable/{ print $10 }' \
  1 'Festplatte mit zu vielen Fehlern'
Category "System Ressources"
#------------------------
# CPU figures are taken from the last line of a two-sample vmstat run
# (fields 15 and 16 of that line).
CheckValueOver CPU_idle \
  'vmstat 5 2' \
  'END {print $15}' \
  20 'CPU ausgelastet'
CheckValueUnder CPU_IO-wait \
  'vmstat 5 2' \
  'END {print $16}' \
  20 'CPU wartet - IO ausgelastet'

# First field of /proc/loadavg
CheckValueUnder Load \
  'cat /proc/loadavg' \
  '{ print $1 }' \
  1 'System-Load zu hoch'

# Memory: third column of the "Mem:" row of free -m, and MemAvailable from
# /proc/meminfo divided by 1000000.
CheckValueUnder MB_RAM_used \
  'free -m' \
  '/Mem:/{ print $3 }' \
  1000 'RAM voll'
CheckValueOver GB_RAM_free \
  'cat /proc/meminfo' \
  '/MemAvailable:/{ print $2 / 1000000 }' \
  3 'RAM voll'

# Process counts: every line of "ps ax", and lines of "ps -A" mentioning
# "defunct".
CheckCountLessThan Prozesse \
  'ps ax' \
  '{print $0 }' \
  140 'zu viele Prozesse'
CheckCountLessThan ZombieProzesse \
  'ps -A' \
  '/defunct/{print $0 }' \
  1 'zu viele Zombie-Prozesse'

# CPU/Board temperature: the raw sysfs value is divided by 1000 and printed
# with one decimal place.
CheckValueUnder Temperatur \
  'head -n 1 /sys/class/thermal/thermal_zone0/temp' \
  '{ t = $0 / 1000 } END{ printf "%.1f", t }' \
  60 'Es wird zu heiss!'

# APC UPS, queried through apcaccess
CheckValueOver USVtimeleft \
  'apcaccess -p TIMELEFT' \
  '{ print $1 }' \
  2 'USV-Battery with only little time left'
CheckValueOver USVbatteryloaded \
  'apcaccess -p BCHARGE' \
  '{ print $1 }' \
  66 'USV-Battery insufficiently charged'
Category "Services Listening"
#------------------------
# Each check counts the lines of "ss -tln" (listening TCP sockets, numeric)
# that contain ":<port> " and alerts when there is none.

# Dovecot
CheckCountMoreThan DovecotSieve_listening 'ss -tln' '/:4190 /{print $0 }' 0 'Dovecot ManageSieve not listening/running'
CheckCountMoreThan DovecotIMAPS_listening 'ss -tln' '/:993 /{print $0 }' 0 'Dovecot IMAPS not listening/running'
# IMAP is port 143. This check used to probe 443, which is HTTPS and already
# covered by HTTPS_listening below, so a dead IMAP listener went unnoticed.
CheckCountMoreThan DovecotIMAP_listening 'ss -tln' '/:143 /{print $0 }' 0 'Dovecot IMAP not listening/running'

# Postfix: one distinctly named check per port. All three used to be called
# PostfixSMTP_listening.
# NOTE(review): presumably the check name keys the stored result, so the
# duplicates overwrote each other - confirm against functions.moshel.
CheckCountMoreThan PostfixSMTP_listening 'ss -tln' '/:25 /{print $0 }' 0 'Postfix SMTP not listening/running'
CheckCountMoreThan PostfixSMTPS_listening 'ss -tln' '/:465 /{print $0 }' 0 'Postfix SMTPS not listening/running'
CheckCountMoreThan PostfixSubmission_listening 'ss -tln' '/:587 /{print $0 }' 0 'Postfix Submit not listening/running'

# Web, DNS and database
CheckCountMoreThan HTTPS_listening 'ss -tln' '/:443 /{print $0 }' 0 'HTTPS server not listening/running'
CheckCountMoreThan HTTP_listening 'ss -tln' '/:80 /{print $0 }' 0 'HTTP server not listening/running'
CheckCountMoreThan DNS_listening 'ss -tln' '/:53 /{print $0 }' 0 'DNS server not listening/running'
CheckCountMoreThan MariaDB_listening 'ss -tln' '/:3306 /{print $0 }' 0 'MariaDB/MySQL server not listening/running'
Category "MySQL / MariaDB checks"
#------------------------
# Both values are picked by field position from the one-line output of
# "mysqladmin status".
CheckValueUnder MySQL_Threads \
  'mysqladmin --user=status --password=status status' \
  '{ print $4 }' \
  20 'Ziemlich viele MySQL-Threads'

# Older versions report this value at field 22 instead.
# Please verify the position by running
#   mysqladmin --user=status --password=status status
# and counting the fields.
CheckValueUnder MySQL_Queries \
  'mysqladmin --user=status --password=status status' \
  '{ print $19 }' \
  5 'Ziemlich viele gleichzeitige MySQL-Querys'
Category "Mail checks"
#------------------------
# Number of lines printed by mailq
CheckCountLessThan MailQueue \
  'mailq' \
  '{print $0}' \
  20 'Mails stauen sich gerade auf unserem Mailer'

# Postfix: count matching journal lines of the last five minutes
CheckCountLessThan PostfixTLS_Incoming \
  'journalctl -u postfix@-.service --since "5 minute ago"' \
  '/TLS connection established from /{print $0}' \
  50 'Viele TLS-Verbindungsversuche zu unserem Mailer'
CheckCountLessThan PostfixTLS_Outgoing \
  'journalctl -u postfix@-.service --since "5 minute ago"' \
  '/TLS connection established to /{print $0}' \
  20 'Viele ausgehende TLS-Verbindungsversuche auf unserem Mailer'
CheckCountLessThan PostfixIncoming \
  'journalctl -u postfix@-.service --since "5 minute ago"' \
  '/ connect from /{print $0}' \
  500 'Viele eingehende Mailversuche'
CheckCountLessThan PostfixNOQUEUE \
  'journalctl -u postfix@-.service --since "5 minute ago"' \
  '/ NOQUEUE: reject: /{print $0}' \
  500 'Ziemlich viel abzulehnen gerade'
CheckCountLessThan PostfixSent \
  'journalctl -u postfix@-.service --since "5 minute ago"' \
  '/ status=sent /{print $0}' \
  500 'Ziemlich viel ausgehende Mails'

# Dovecot: same approach on the dovecot unit's journal
CheckCountLessThan DovecotStored \
  'journalctl -u dovecot --since "5 minute ago"' \
  '/ mail saved to /{print $0}' \
  20 'Ziemlich viele abgelegte Mails'
CheckCountLessThan DovecotSieved \
  'journalctl -u dovecot --since "5 minute ago"' \
  '/ sieve: /{print $0}' \
  20 'Ziemlich viele gefilterte Mails'
CheckCountLessThan DovecotLoginFailed \
  'journalctl -u dovecot --since "5 minute ago"' \
  '/ imap-login: Disconnected \(auth failed, /{print $0}' \
  10 'Dovecot Bruteforce-Versuch!'
Category "DNS+RBL checks"
#------------------------
# You are listed on an RBL if looking up <reversed IP>.<RBL zone> yields
# 127.0.0.*
# Here: mail.wyae.de = 90.187.34.181, so the reversed form is 181.34.187.90
#
# NOTE(review): some of these zones (njabl, rfc-ignorant, spamcannibal) are
# reported to have been shut down - verify and prune the list if so.
for RBL in \
  zen.spamhaus.org \
  cbl.abuseat.org \
  virbl.dnsbl.bit.nl \
  dnsbl.inps.de \
  ix.dnsbl.manitu.net \
  no-more-funn.moensted.dk \
  combined.njabl.org \
  dnsbl.njabl.org \
  bl.spamcannibal.org \
  bl.spamcop.net \
  dnsbl-1.uceprotect.net \
  dsn.rfc-ignorant.org \
  postmaster.rfc-ignorant.org \
  bogusmx.rfc-ignorant.org
do
  # one check per zone; the zone name becomes part of the check name
  CheckCountLessThan "RBL_$RBL" \
    "dig +short 181.34.187.90.$RBL" \
    '/127.0./{print $0}' \
    1 "Mailserver steht auf RBL $RBL"
done

# The mail host must resolve to its address via the default resolver as well
# as via the two public resolvers queried explicitly.
CheckCountMoreThan DNS_mail.wyae.de_Provider \
  "dig +short mail.wyae.de" \
  '/90.187.34.181/{print $0}' \
  0 "Mailserver DNS beim Provider unbekannt"
CheckCountMoreThan DNS_mail.wyae.de_Cloudflare \
  "dig +short mail.wyae.de @1.1.1.1" \
  '/90.187.34.181/{print $0}' \
  0 "Mailserver DNS bei Cloudflare unbekannt"
CheckCountMoreThan DNS_mail.wyae.de_Google \
  "dig +short mail.wyae.de @8.8.8.8" \
  '/90.187.34.181/{print $0}' \
  0 "Mailserver DNS bei Google unbekannt"
Category "Network checks"
#------------------------
# Round-trip time: the ping summary line is split on "/" and its fifth
# piece is reported.
CheckValueUnder PingGoogle \
  'ping -n -c 3 -w 5 8.8.8.8' \
  'BEGIN {FS="/"}; /rtt min\/avg\/max\/mdev/{ print $5}' \
  30 'Ping dauert zu lang'

# Established TCP connections
CheckCountLessThan NetworkConnections \
  'ss -tn' \
  '/ESTAB/{print $0}' \
  50 'Ziemlich viele offene TCP-Verbindungen'

# SERVER is a placeholder - replace it with the host name to probe.
# The PHP probe expects the page to contain "php REALLY works".
CheckCountMoreThan PHP_running \
  'curl -s --retry 1 http://SERVER/phpcheck.php' \
  '/php REALLY works/{print $0 }' \
  0 'PHP kaputt? Lighttpd bitte neu starten.'

# curl's error output is sent to stdout ("--stderr -") so that a certificate
# complaint can be matched by the filter.
CheckCountLessThan TLS-Certificate_SERVER \
  'curl -I --retry 1 https://SERVER/mosshecheck.txt --stderr -' \
  '/SSL certificate problem/{print $0 }' \
  1 'TLS Zertifikat abgelaufen!'
Category "IoCs"
#------------------------
# Processes attached to a pseudo terminal (lines of "ps ax" containing "pts/")
CheckCountLessThan InteraktivesPTS \
  'ps ax' \
  '/pts\//{print $0 }' \
  1 'Nutzer mit interaktivem Login'

# Lines of "w" output that contain the letter "t"
CheckCountLessThan User \
  'w' \
  '/t/{print $0 }' \
  1 'Interaktive Nutzer auf dem System!'

# File change detection.
# Run "./generate_compares.sh" in the /usr/local/lib/moshel directory first.
CheckFileChanges resolv.conf      /etc/resolv.conf
CheckFileChanges passwd           /etc/passwd
CheckFileChanges shadow           /etc/shadow
CheckFileChanges authorized_keys  /root/.ssh/authorized_keys
Category "QNAP NAS checks via SNMP"
# Replace SNMPCOMMUNITY with your SNMP community string and IPADDRESS with
# the IP address of your NAS.
#------------------------
### ideas from https://github.com/nikband/check_qnap3.sh/blob/master/check_qnap3.sh

# --- Volume 1 ---
CheckCountMoreThan NASvolume_1_status \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.6.1' \
  '/STRING: "Ready"/' \
  0 'Volume offline'
CheckValueOver NASvolume_1_free_GB \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.1' \
  '{print substr($4,2,9)}' \
  50 'Volume voll'

# --- Volume 2 ---
CheckCountMoreThan NASvolume_2_status \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.6.2' \
  '/STRING: "Ready"/' \
  0 'Volume offline'
#CheckCountMoreThan NASvolume_2_typ_TB 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.2' '/ TB"/{print $5}' 0 "Volume hat nur noch GByte frei, nicht mehr TB!"
CheckValueOver NASvolume_2_free_GB \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.2' \
  '{print substr($4,2,9)}' \
  500 'Volume voll'

# --- Volume 3 ---
CheckCountMoreThan NASvolume_3_status \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.6.3' \
  '/STRING: "Ready"/' \
  0 'Volume offline'
CheckCountMoreThan NASvolume_3_typ_TB \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.3' \
  '/ TB"/{print $5}' \
  0 "Volume hat nur noch GByte frei, nicht mehr TB!"
#CheckValueOver NASvolume_3_free_GB 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.3' '{print substr($4,2,9)}' 500 'Volume voll'

# --- System: CPU, RAM, temperature, fan ---
# substr($4,2,n) skips the first character of the fourth field before the
# value is read.
CheckValueUnder NAS_CPU_usage \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.1.0' \
  '{print substr($4,2,9)}' \
  90 'CPU ausgelastet'
CheckValueOver NAS_RAM_free \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.2.0' \
  '{print substr($4,2,9)}' \
  200 'RAM voll'
CheckValueUnder NAS_Temp_usage \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.6.0' \
  '{print substr($4,2,4)}' \
  50 'System zu heiss'
# The fan speed is bounded from both sides with the same query.
CheckValueUnder NAS_Fan_RPM_high \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.15.1.3.1' \
  '{print substr($4,2,4)}' \
  1000 'Luefter dreht zu hoch'
CheckValueOver NAS_Fan_RPM_low \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.15.1.3.1' \
  '{print substr($4,2,4)}' \
  500 'Luefter dreht zu niedrig'

# --- Discs: status and temperature ---
CheckCountMoreThan NAS_HDD1_status \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.4.1' \
  '/STRING: "Good"/{print $0}' \
  0 'HDD offline'
CheckValueUnder NAS_HDD1_Temp \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.6.1' \
  '{print $4}' \
  50 'HDD zu heiss'
CheckCountMoreThan NAS_HDD2_status \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.4.2' \
  '/STRING: "Good"/{print $0}' \
  0 'HDD offline'
CheckValueUnder NAS_HDD2_Temp \
  'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.6.2' \
  '{print $4}' \
  50 'HDD zu heiss'
Category "Ambient Sensors"
#------------------------
# most sensors basically never should trigger, but are used to collect data
#
# Each reading is first written to the scratch file $FN and then handed to
# the check as "cat $FN". The file is reused for every sensor and removed at
# the end of this section.
FN=$( mktemp )

# The log file is expected to be produced by a running service, e.g.
#   rtl_433 -C si -F csv:/var/log/rtl_433.log
# MATCH_SENSOR_STRING is a placeholder for the text identifying your sensor.
# Field 9 of the newest matching CSV line is used as the temperature.
grep -F 'MATCH_SENSOR_STRING' /var/log/rtl_433.log \
  | tail -n 1 | cut -d ',' -f 9 > "$FN"
# The filter is '{ print $0 }' rather than the bare pattern '$0': as an awk
# pattern, $0 is false for a numeric zero, so a reading of exactly 0 would
# have produced no value at all.
# NOTE(review): assumes the third argument is run as an awk program, as all
# other checks in this file suggest - confirm against functions.moshel.
CheckValueOver AirTemperature "cat $FN" '{ print $0 }' 1 'Danger to the garden! Freezing temperatures'

# "alert" check only to log data (field 13 of the same log line)
grep -F 'MATCH_SENSOR_STRING' /var/log/rtl_433.log \
  | tail -n 1 | cut -d ',' -f 13 > "$FN"
CheckValueUnder AirHumidity "cat $FN" '{ print $0 }' 100 'Sensor broken?'

# retrieve power usage/solar generation from an OpenDTU
# Enable exactly ONE of the two fetches below, matching your firmware.
# Previously both ran and the first result was always overwritten by the
# second, costing an extra HTTP request per run.
#-- *old* OpenDTU v24.1.26
# curl --retry-max-time 9 --retry 2 -s http://IPADDRESS/api/livedata/status | jq '.inverters[0].AC."0".Power.v' | cut -d '.' -f 1 > "$FN"
#-- after OpenDTU v24.2.12
# (jq filters are single-quoted so the shell cannot glob-expand them)
curl --retry-max-time 9 --retry 2 -s http://IPADDRESS/api/livedata/status \
  | jq '.total.Power.v' | cut -d '.' -f 1 > "$FN"
CheckValueUnder Solarstrom-OpenDTU "cat $FN" '{ print $0 }' 800 'Solar generation above 800W'

# retrieve power usage/solar generation from a Shelly (Gen1-API)
curl --retry-max-time 9 --retry 2 -s -u USER:PASSWORD --basic http://IPADDRESS/status \
  | jq '.meters[0].power' > "$FN"
CheckValueUnder Solarstrom-Shelly1pm "cat $FN" '{ print $0 }' 800 'Solar generation above 800W'

# retrieve power usage/solar generation from a ShellyPlugS (Gen2-API)
curl --retry-max-time 9 --retry 2 -s -u USER:PASSWORD --anyauth 'http://IPADDRESS/rpc/Switch.GetStatus?id=0' \
  | jq '.apower' > "$FN"
CheckValueUnder Solarstrom-ShellyPlugS "cat $FN" '{ print $0 }' 800 'Solar generation above 800W'

# Remove the scratch file; -f keeps this quiet if it is already gone.
rm -f -- "$FN"
#=========================================================
# Summary
#=========================================================
# Optional credentials, disabled by default:
# CURLAUTH=" --digest --user USERNAME:PASSWORD"

# SERVER1 / SERVER22 / SERVER333 are placeholders for the MoSheL URLs that
# are handed to Centralize.
Centralize \
  https://SERVER1/moshel \
  http://SERVER22/moshel \
  http://SERVER333/moshel

CleanUp

# restart the httpd if a PHP check fails (for the first time)
ActionOnAlert PHP_running \
  'systemctl restart lighttpd'
#############################################################################
# MoSheL: remote server monitoring environment
#
# Copyright (C) 2020- Volker Tanger
#
# Licensed under the EUROPEAN UNION PUBLIC LICENCE v. 1.2 (or later)
# This is free software - see attached file LICENSE_EUPL-1.2_EN.txt
# and available in other languages under
# https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12
#
# For bug reports and suggestions or if you just want to talk to me please
# contact me at volker.tanger@wyae.de
#
# Updates will be available at https://www.wyae.de/software/moshel/
# please check there for updates prior to submitting patches!
#
# For a list of changes please refer to the HISTORY file. Thanks.
#############################################################################
|