#!/bin/bash
# moshel.example - example MoSheL check configuration for a single host.
# set -x
#=========================================================
# our configuration
#=========================================================
# These settings are defined before functions.moshel is sourced below;
# presumably that library reads them - verify against functions.moshel.
MYNAME=alpha
WEBURL="https://alpha.wyae.de/moshel"
ALERTTO='root_alpha@wyae.de'
WWWDIR=/home/www/alpha/moshel
CMPDIR=/usr/local/lib/moshel/Compare
# keep empty if you want checks to run, otherwise state short maintenance message
MAINTENANCE=""
# MAINTENANCE="Techniker ist informiert"
# Quoted so that a path containing spaces or glob characters stays one argument.
mkdir -p -- "$CMPDIR"
DATADIR=$WWWDIR/data/$MYNAME
mkdir -p -- "$DATADIR"
#=========================================================
# Checks are named as "SHOULD BE" - so alert if exceeded
#=========================================================
# Brings the Category / Check* / Centralize / CleanUp / ActionOnAlert
# commands used in the rest of this file into scope.
. /usr/local/lib/moshel/functions.moshel
  21. Category "HardDiscs"
  22. #------------------------
  23. CheckValueOver __MB_HDD_free 'df -m /' '/\/dev\//{ print $4 }' 20000 'Platte voll'
  24. CheckValueOver __GB_HDD_free 'df -m /' '/\/dev\//{ print $4 / 1024 }' 20 'Platte voll'
  25. # the following checks needs the "smartmontools" to be installed
  26. # Problem: smartctl keeps discs spinning?!
  27. # SSD-only
  28. CheckValueUnder sda_SSD_WearLevelingCount 'smartctl -A /dev/sda' '/Wear_Leveling_Count/{ print $10 }' 95 'SSD fast worn down'
  29. CheckValueUnder sda_SSD_ReservedBlocks 'smartctl -A /dev/sda' '/Used_Rsvd_Blk_Cnt_To/{ print $10 }' 100 'SSD-Reserve fast verbraucht'
  30. CheckValueUnder sda_SSD_ProgramFails 'smartctl -A /dev/sda' '/Program_Fail_Cnt_Total/{ print $10 }' 95 'SSD zu viele Program Fails'
  31. CheckValueUnder sda_SSD_EraseFails 'smartctl -A /dev/sda' '/Erase_Fail_Count_Total/{ print $10 }' 95 'SSD mit Schwierigkeiten beim Loeschen'
  32. CheckValueUnder sda_SSD_UncorrectableErrors 'smartctl -A /dev/sda' '/Uncorrectable_Error_Cnt/{ print $10 }' 95 'SSD mit zu vielen Fehlern'
  33. # SSD+HDD
  34. CheckValueUnder sda_Temperatur 'smartctl -A /dev/sda' '/194 Temperature_Celsius/{ print $10 }' 40 'Festplatte zu heiss'
  35. # HDD-only
  36. CheckValueUnder sdb_HDD_ReallocatedSectors 'smartctl -A /dev/sdb' '/Reallocated_Sector_Ct/{ print $10 }' 1 'Festplatte mit fehlerhaften Sektoren'
  37. CheckValueUnder sdb_HDD_PendingSector 'smartctl -A /dev/sdb' '/197 Current_Pending_Sector/{ print $10 }' 1 'Festplatte mit unbeschreibbaren Sektoren'
  38. CheckValueUnder sdb_HDD_OfflineUncorrectable 'smartctl -A /dev/sdb' '/198 Offline_Uncorrectable/{ print $10 }' 1 'Festplatte mit zu vielen Fehlern'
  39. Category "System Ressources"
  40. #------------------------
  41. CheckValueOver CPU_idle 'vmstat 5 2' 'END {print $15}' 20 'CPU ausgelastet'
  42. CheckValueUnder CPU_IO-wait 'vmstat 5 2' 'END {print $16}' 20 'CPU wartet - IO ausgelastet'
  43. CheckValueUnder Load 'cat /proc/loadavg' '{ print $1 }' 1 'System-Load zu hoch'
  44. CheckValueUnder MB_RAM_used 'free -m' '/Mem:/{ print $3 }' 1000 'RAM voll'
  45. CheckValueOver GB_RAM_free 'cat /proc/meminfo' '/MemAvailable:/{ print $2 / 1000000 }' 3 'RAM voll'
  46. CheckCountLessThan Prozesse 'ps ax' '{print $0 }' 140 'zu viele Prozesse'
  47. CheckCountLessThan ZombieProzesse 'ps -A' '/defunct/{print $0 }' 1 'zu viele Zombie-Prozesse'
  48. # CPU/Board temperature
  49. CheckValueUnder Temperatur 'head -n 1 /sys/class/thermal/thermal_zone0/temp' '{ t = $0 / 1000 } END{ printf "%.1f", t }' 60 'Es wird zu heiss!'
  50. # APC UPS
  51. CheckValueOver USVtimeleft 'apcaccess -p TIMELEFT' '{ print $1 }' 2 'USV-Battery with only little time left'
  52. CheckValueOver USVbatteryloaded 'apcaccess -p BCHARGE' '{ print $1 }' 66 'USV-Battery insufficiently charged'
  53. Category "Services Listening"
  54. #------------------------
  55. CheckCountMoreThan DovecotSieve_listening 'ss -tln' '/:4190 /{print $0 }' 0 'Dovecot ManageSieve not listening/running'
  56. CheckCountMoreThan DovecotIMAPS_listening 'ss -tln' '/:993 /{print $0 }' 0 'Dovecot IMAPS not listening/running'
  57. CheckCountMoreThan DovecotIMAP_listening 'ss -tln' '/:443 /{print $0 }' 0 'Dovecot IMAP not listening/running'
  58. CheckCountMoreThan PostfixSMTP_listening 'ss -tln' '/:25 /{print $0 }' 0 'Postfix SMTP not listening/running'
  59. CheckCountMoreThan PostfixSMTP_listening 'ss -tln' '/:465 /{print $0 }' 0 'Postfix SMTPS not listening/running'
  60. CheckCountMoreThan PostfixSMTP_listening 'ss -tln' '/:587 /{print $0 }' 0 'Postfix Submit not listening/running'
  61. CheckCountMoreThan HTTPS_listening 'ss -tln' '/:443 /{print $0 }' 0 'HTTPS server not listening/running'
  62. CheckCountMoreThan HTTP_listening 'ss -tln' '/:80 /{print $0 }' 0 'HTTP server not listening/running'
  63. CheckCountMoreThan DNS_listening 'ss -tln' '/:53 /{print $0 }' 0 'DNS server not listening/running'
  64. CheckCountMoreThan MariaDB_listening 'ss -tln' '/:3306 /{print $0 }' 0 'MariaDB/MySQL server not listening/running'
  65. Category "MySQL / MariaDB checks"
  66. #------------------------
  67. CheckValueUnder MySQL_Threads 'mysqladmin --user=status --password=status status' '{ print $4 }' 20 'Ziemlich viele MySQL-Threads'
  68. # bei aelteren Versionen an Position 22
  69. # bitte Position pruefen mit "mysqladmin --user=status --password=status status" und dann abzaehlen
  70. CheckValueUnder MySQL_Queries 'mysqladmin --user=status --password=status status' '{ print $19 }' 5 'Ziemlich viele gleichzeitige MySQL-Querys'
  71. Category "Mail checks"
  72. #------------------------
  73. CheckCountLessThan MailQueue 'mailq' '{print $0}' 20 'Mails stauen sich gerade auf unserem Mailer'
  74. CheckCountLessThan PostfixTLS_Incoming 'journalctl -u postfix@-.service --since "5 minute ago"' '/TLS connection established from /{print $0}' 50 'Viele TLS-Verbindungsversuche zu unserem Mailer'
  75. CheckCountLessThan PostfixTLS_Outgoing 'journalctl -u postfix@-.service --since "5 minute ago"' '/TLS connection established to /{print $0}' 20 'Viele ausgehende TLS-Verbindungsversuche auf unserem Mailer'
  76. CheckCountLessThan PostfixIncoming 'journalctl -u postfix@-.service --since "5 minute ago"' '/ connect from /{print $0}' 500 'Viele eingehende Mailversuche'
  77. CheckCountLessThan PostfixNOQUEUE 'journalctl -u postfix@-.service --since "5 minute ago"' '/ NOQUEUE: reject: /{print $0}' 500 'Ziemlich viel abzulehnen gerade'
  78. CheckCountLessThan PostfixSent 'journalctl -u postfix@-.service --since "5 minute ago"' '/ status=sent /{print $0}' 500 'Ziemlich viel ausgehende Mails'
  79. CheckCountLessThan DovecotStored 'journalctl -u dovecot --since "5 minute ago"' '/ mail saved to /{print $0}' 20 'Ziemlich viele abgelegte Mails'
  80. CheckCountLessThan DovecotSieved 'journalctl -u dovecot --since "5 minute ago"' '/ sieve: /{print $0}' 20 'Ziemlich viele gefilterte Mails'
  81. CheckCountLessThan DovecotLoginFailed 'journalctl -u dovecot --since "5 minute ago"' '/ imap-login: Disconnected \(auth failed, /{print $0}' 10 'Dovecot Bruteforce-Versuch!'
  82. Category "DNS+RBL checks"
  83. #------------------------
  84. # you are on an RBL if Reversed IP + RBL-URL yield 127.0.0.*
  85. # here mail.wyae.de = 90.187.34.181
  86. #
  87. for RBL in zen.spamhaus.org cbl.abuseat.org virbl.dnsbl.bit.nl dnsbl.inps.de ix.dnsbl.manitu.net no-more-funn.moensted.dk combined.njabl.org dnsbl.njabl.org bl.spamcannibal.org bl.spamcop.net dnsbl-1.uceprotect.net dsn.rfc-ignorant.org postmaster.rfc-ignorant.org bogusmx.rfc-ignorant.org; do
  88. CheckCountLessThan RBL_$RBL "dig +short 181.34.187.90.$RBL" '/127.0./{print $0}' 1 "Mailserver steht auf RBL $RBL"
  89. done
  90. CheckCountMoreThan DNS_mail.wyae.de_Provider "dig +short mail.wyae.de" '/90.187.34.181/{print $0}' 0 "Mailserver DNS beim Provider unbekannt"
  91. CheckCountMoreThan DNS_mail.wyae.de_Cloudflare "dig +short mail.wyae.de @1.1.1.1" '/90.187.34.181/{print $0}' 0 "Mailserver DNS bei Cloudflare unbekannt"
  92. CheckCountMoreThan DNS_mail.wyae.de_Google "dig +short mail.wyae.de @8.8.8.8" '/90.187.34.181/{print $0}' 0 "Mailserver DNS bei Google unbekannt"
  93. Category "Network checks"
  94. #------------------------
  95. CheckValueUnder PingGoogle 'ping -n -c 3 -w 5 8.8.8.8' 'BEGIN {FS="/"}; /rtt min\/avg\/max\/mdev/{ print $5}' 30 'Ping dauert zu lang'
  96. CheckCountLessThan NetworkConnections 'ss -tn' '/ESTAB/{print $0}' 50 'Ziemlich viele offene TCP-Verbindungen'
  97. CheckCountMoreThan PHP_running 'curl -s --retry 1 http://SERVER/phpcheck.php' '/php REALLY works/{print $0 }' 0 'PHP kaputt? Lighttpd bitte neu starten.'
  98. CheckCountLessThan TLS-Certificate_SERVER 'curl -I --retry 1 https://SERVER/mosshecheck.txt --stderr -' '/SSL certificate problem/{print $0 }' 1 'TLS Zertifikat abgelaufen!'
  99. Category "IoCs"
  100. #------------------------
  101. CheckCountLessThan InteraktivesPTS 'ps ax' '/pts\//{print $0 }' 1 'Nutzer mit interaktivem Login'
  102. CheckCountLessThan User 'w' '/t/{print $0 }' 1 'Interaktive Nutzer auf dem System!'
  103. # run "./generate_compares.sh" in /usr/local/lib/moshel directory
  104. CheckFileChanges resolv.conf /etc/resolv.conf
  105. CheckFileChanges passwd /etc/passwd
  106. CheckFileChanges shadow /etc/shadow
  107. CheckFileChanges authorized_keys /root/.ssh/authorized_keys
  108. Category "QNAP NAS checks via SNMP"
  109. # change SNMPCOMMUNITY to your SNMP community string, and replace IPADDRESS with your NAS' IP address
  110. #------------------------
  111. ### ideas from https://github.com/nikband/check_qnap3.sh/blob/master/check_qnap3.sh
  112. CheckCountMoreThan NASvolume_1_status 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.6.1' '/STRING: "Ready"/' 0 'Volume offline'
  113. CheckValueOver NASvolume_1_free_GB 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.1' '{print substr($4,2,9)}' 50 'Volume voll'
  114. CheckCountMoreThan NASvolume_2_status 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.6.2' '/STRING: "Ready"/' 0 'Volume offline'
  115. #CheckCountMoreThan NASvolume_2_typ_TB 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.2' '/ TB"/{print $5}' 0 "Volume hat nur noch GByte frei, nicht mehr TB!"
  116. CheckValueOver NASvolume_2_free_GB 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.2' '{print substr($4,2,9)}' 500 'Volume voll'
  117. CheckCountMoreThan NASvolume_3_status 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.6.3' '/STRING: "Ready"/' 0 'Volume offline'
  118. CheckCountMoreThan NASvolume_3_typ_TB 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.3' '/ TB"/{print $5}' 0 "Volume hat nur noch GByte frei, nicht mehr TB!"
  119. #CheckValueOver NASvolume_3_free_GB 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.17.1.5.3' '{print substr($4,2,9)}' 500 'Volume voll'
  120. CheckValueUnder NAS_CPU_usage 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.1.0' '{print substr($4,2,9)}' 90 'CPU ausgelastet'
  121. CheckValueOver NAS_RAM_free 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.2.0' '{print substr($4,2,9)}' 200 'RAM voll'
  122. CheckValueUnder NAS_Temp_usage 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.6.0' '{print substr($4,2,4)}' 50 'System zu heiss'
  123. CheckValueUnder NAS_Fan_RPM_high 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.15.1.3.1' '{print substr($4,2,4)}' 1000 'Luefter dreht zu hoch'
  124. CheckValueOver NAS_Fan_RPM_low 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.2.15.1.3.1' '{print substr($4,2,4)}' 500 'Luefter dreht zu niedrig'
  125. CheckCountMoreThan NAS_HDD1_status 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.4.1' '/STRING: "Good"/{print $0}' 0 'HDD offline'
  126. CheckValueUnder NAS_HDD1_Temp 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.6.1' '{print $4}' 50 'HDD zu heiss'
  127. CheckCountMoreThan NAS_HDD2_status 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.4.2' '/STRING: "Good"/{print $0}' 0 'HDD offline'
  128. CheckValueUnder NAS_HDD2_Temp 'snmpget -v1 -c SNMPCOMMUNITY IPADDRESS 1.3.6.1.4.1.24681.1.4.1.1.1.1.5.2.1.6.2' '{print $4}' 50 'HDD zu heiss'
  129. Category "Ambient Sensors"
  130. #------------------------
  131. # most sensors basically never should trigger, but are used to collect data
  132. FN=$( mktemp )
  133. # generate a log file with a service running
  134. # rtl_433 -C si -F csv:/var/log/rtl_433.log"
  135. fgrep 'MATCH_SENSOR_STRING' /var/log/rtl_433.log \
  136. | tail -n 1 | cut -d ',' -f 9 > $FN
  137. CheckValueOver AirTemperature "cat $FN" '$0' 1 'Danger to the garden! Freezing temperatures'
  138. # "alert" check only to log data
  139. fgrep 'MATCH_SENSOR_STRING' /var/log/rtl_433.log \
  140. | tail -n 1 | cut -d ',' -f 13 > $FN
  141. CheckValueUnder AirHumidity "cat $FN" '$0' 100 'Sensor broken?'
  142. # retrieve power usage/solar generation from an OpenDTU
  143. #-- *old* OpenDTU v24.1.26
  144. curl --retry-max-time 9 --retry 2 -s http://IPADDRESS/api/livedata/status | jq .inverters[0].AC.\"0\".Power.v | cut -d '.' -f 1 > $FN
  145. #-- after OpenDTU v24.2.12
  146. curl --retry-max-time 9 --retry 2 -s http://IPADDRESS/api/livedata/status | jq .total.Power.v | cut -d '.' -f 1 > $FN
  147. CheckValueUnder Solarstrom-OpenDTU "cat $FN" '{ print $0 }' 800 'Solar generation above 800W'
  148. # retrieve power usage/solar generation from a Shelly (Gen1-API)
  149. curl --retry-max-time 9 --retry 2 -s -u USER:PASSWORD --basic http://IPADDRESS/status | jq '.meters[0].power' > $FN
  150. CheckValueUnder Solarstrom-Shelly1pm "cat $FN" '{ print $0 }' 800 'Solar generation above 800W'
  151. # retrieve power usage/solar generation from a ShellyPlugS (Gen2-API)
  152. curl --retry-max-time 9 --retry 2 -s -u USER:PASSWORD --anyauth 'http://IPADDRESS/rpc/Switch.GetStatus?id=0' | jq .apower > $FN
  153. CheckValueUnder Solarstrom-ShellyPlugS "cat $FN" '{ print $0 }' 800 'Solar generation above 800W'
  154. rm $FN
  155. #=========================================================
  156. # Summary
  157. #=========================================================
  158. # CURLAUTH=" --digest --user USERNAME:PASSWORD"
  159. Centralize https://SERVER1/moshel http://SERVER22/moshel http://SERVER333/moshel
  160. CleanUp
  161. # restart the httpd if a PHP check fails (for the first time)
  162. ActionOnAlert PHP_running 'systemctl restart lighttpd'
#############################################################################
# MoSheL: remote server monitoring environment
#
# Copyright (C) 2020- Volker Tanger
#
# Licensed under the EUROPEAN UNION PUBLIC LICENCE v. 1.2 (or later)
# This is free software - see attached file LICENSE_EUPL-1.2_EN.txt
# and available in other languages under
# https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12
#
# For bug reports and suggestions or if you just want to talk to me please
# contact me at volker.tanger@wyae.de
#
# Updates will be available at https://www.wyae.de/software/moshel/
# please check there for updates prior to submitting patches!
#
# For list of changes please refer to the HISTORY file. Thanks.
#############################################################################