- edited description
[solved] new version of watchdog
I had some time today to look into the watchdog script and found a few items I could optimise. The key points are:
Changelog
- don't use nslookup if targets are defined as IP
- general code optimisation
- better condition statements
- optimised all the watchdog methods (traceroute/ping/curl)
- traceroute doesn’t work properly when FT is behind a main router (the received-bytes counter will always be greater after the traceroute is run, because the gateway will respond). As a matter of fact, I would suggest not using it, and certainly removing it as the default method, which should be ping instead, IMO.
2020.8 watchdog:
real 0m 13.88s
user 0m 0.14s
sys 0m 0.23s
New watchdog:
real 0m 0.33s
user 0m 0.12s
sys 0m 0.15s
We need to add some note on the GUI covering these points:
- ping: make sure the item is pingable under normal circumstances
- curl: make sure there’s a webserver responding at the provided host
- traceroute: might not work as expected in certain scenarios
- It is advised to define only 1 target (2 is OK, but are we really worried about Google being down?)
- It is advised to use IP instead of FQDN (to avoid lookup processing)
P.S. I’m sure I can go further with the optimisation but prefer this to be tested first.
#!/bin/sh
# Copyright (C) 2015 shibby
# changes/fixes: 2018 - 2021 pedro
# optimised: 2022 rs232
# --- Global configuration and runtime state ---------------------------------
PID=$$
PIDFILE="/var/run/watchdog.pid"		# holds the PID of the instance doing a check
IPLISTFILE="/tmp/watchdog.iplist"	# findHost writes its result here (it runs in a subshell)

# Number of WANs. Fall back to 1 when nvram returns nothing or a non-number,
# so the numeric tests on $MWAN throughout the script never choke.
MWAN=$(nvram get mwan_num)
case "$MWAN" in
	''|*[!0-9]*) MWAN=1 ;;
esac

IPLIST=""
TMP_ROUTE_TABLE_ID=555
MWANTABLE="wan"
wh="\033[1;37m"		# ANSI: bold white
bl="\033[0;40m"		# ANSI: reset attributes, black background

printf '\n%s\n' "─────────────────── WatchDog ───────────────────"

# Build the list of WAN prefixes: "wan" for the first one, "wanN" for N >= 2.
i=1
while [ "$i" -le "$MWAN" ]; do
	[ "$i" -gt 1 ] && MWANTABLE="$MWANTABLE wan$i"
	i=$((i+1))
done

# $LOGS and $DEBUG are expanded unquoted all over the script
# (`$LOGS "message"`). Keeping the logger command line in the variable itself
# exposes "watchdog[$PID]" and "***" to pathname expansion at every call site,
# so the variables hold the name of a wrapper function instead.
watchdog_log() {
	logger -t "watchdog[$PID]" "$@"
}

watchdog_debug() {
	logger -p DEBUG -t "watchdog[$PID]" "*** $*"
}

LOGS="watchdog_log"
MWAN_DEBUG=$(nvram get mwan_debug)
if [ "${MWAN_DEBUG:-0}" -gt 0 ] 2>/dev/null; then
	DEBUG="watchdog_debug"
else
	DEBUG="echo -e"
fi
# timeout SECONDS COMMAND [ARG...]
# Runs COMMAND in a background subshell and, in parallel, a watcher that kills
# that subshell once SECONDS have elapsed. Returns the status `wait` reports
# for the COMMAND subshell.
timeout() {
	local worker watcher status

	# "$1" is still the time limit in this shell; only the worker subshell
	# shifts it away before running the command.
	( shift; "$@" ) &
	worker=$!

	( sleep "$1"; kill "$worker" 2>/dev/null ) &
	watcher=$!

	wait "$worker"
	status=$?

	# The command is done (or was killed): the watcher is no longer needed.
	kill "$watcher" 2>/dev/null
	return "$status"
}
# Turn the comma-separated targets in nvram "mwan_ckdst" into a space-separated
# list of addresses. Entries that look like a hostname are resolved through the
# resolver at 127.0.0.1; everything else (plain IPs) is taken as-is, so no
# lookup is spent on it.
# Globals: IPLIST (appended to), IPLISTFILE (written).
# The caller runs this function in a subshell, so the result is also written to
# $IPLISTFILE; the IPLIST assignment alone would be lost.
findHost() {
	local host ip dst

	dst=$(nvram get mwan_ckdst)

	# "g" flag: split on EVERY comma, not only the first one.
	for host in $(echo "$dst" | sed 's/,/ /g'); do
		# -q: only the match status is wanted; do not echo the hostname.
		if echo "$host" | grep -Eq '((([a-zA-Z]{1,2})|([0-9]{1,2})|([a-zA-Z0-9]{1,2})|([a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9]))\.)+[a-zA-Z]{2,6}'; then
			# Skip the header lines, drop IPv6 answers and keep the third
			# field of the last remaining line.
			# NOTE(review): this assumes answers formatted as
			# "Address N: <ip> <name>" - confirm against the nslookup
			# shipped with the firmware.
			ip=$(nslookup "$host" 127.0.0.1 2>/dev/null | tail -n +5 | grep -v :: | awk '{print $3}' | tail -n 1)
			# A failed lookup contributes nothing to the list.
			[ -n "$ip" ] && IPLIST="${IPLIST} $ip"
		else
			IPLIST="${IPLIST} $host"
		fi
	done

	echo "$IPLIST" > "$IPLISTFILE" # because we're in subshell
}
# Check every WAN listed in $MWANTABLE and react to the outcome.
# For each WAN that is neither paused nor disabled:
#   1. add temporary host routes so the targets in $IPLIST are reached through
#      this WAN's interface,
#   2. run the configured check (1 = ping, 2 = traceroute, anything else =
#      curl); the check reports through the global RESULT (0 = no answer),
#   3. remove the temporary routes again,
#   4. on failure trigger/log a reconnect, on success record state "1"
#      (after a DHCP release/renew if the previous state was "0").
# Globals read:    MWANTABLE MWAN IPLIST LOGS DEBUG wh bl
# Globals written: PREFIX IFACE ISPPPD WEIGHT METHOD PROTO DEMAND RESULT
#                  PREFIX_MWAN STATE_FILE (IFACE and RESULT are the interface
#                  to ckping/cktracert/ckcurl and therefore stay global).
watchdogRun() {
	local ACTION LISTEN_PID

	for PREFIX in $MWANTABLE; do
		IFACE=$(nvram get "${PREFIX}_iface")
		if [ -f "/tmp/ppp/pppd$PREFIX" ]; then ISPPPD=1; else ISPPPD=0; fi
		WEIGHT=$(nvram get "${PREFIX}_weight")
		METHOD=$(nvram get "${PREFIX}_ckmtd")
		PROTO=$(nvram get "${PREFIX}_proto")
		DEMAND=$(nvram get "${PREFIX}_ppp_demand")
		RESULT=0
		PREFIX_MWAN=$PREFIX
		STATE_FILE="/var/lib/misc/${PREFIX}_state"

		if [ "$(nvram get "${PREFIX}_ck_pause")" = "1" ]; then
			$DEBUG "Watchdog paused for $PREFIX - skipping ..."
			continue
		fi

		[ "$PROTO" != "disabled" ] || continue

		if [ "$(nvram get mwan_debug)" -gt 0 ] 2>/dev/null; then
			ISUP=$(wanuptime "$PREFIX")
			ISGW=$(ip route | grep "$IFACE" | grep -v "link" | wc -l)
			$DEBUG "prefix=$PREFIX, iface=$IFACE, uptime=$ISUP, ISGW=$ISGW, WEIGHT=$WEIGHT"
		fi

		DEFAULT_ROUTE_FRAGMENT=$(ip route | grep default | cut -d' ' -f2-)
		GATEWAY_FRAGMENT="via $(nvram get "${PREFIX}_gateway")"
		# When /tmp/ppp/pppd$PREFIX exists the route is added without a
		# "via" gateway part.
		[ "$ISPPPD" -eq 1 ] && GATEWAY_FRAGMENT=""

		# ROUTE_EXEC is expanded unquoted on purpose: it carries a whole
		# command line that must be split into words.
		for IP in $IPLIST; do
			if [ -n "$DEFAULT_ROUTE_FRAGMENT" ]; then
				ROUTE_EXEC="ip route add $IP $DEFAULT_ROUTE_FRAGMENT"
				$DEBUG $ROUTE_EXEC
				$ROUTE_EXEC
			fi
			ROUTE_EXEC="ip route add $IP dev $IFACE $GATEWAY_FRAGMENT metric 50000"
			$DEBUG $ROUTE_EXEC
			$ROUTE_EXEC
		done

		case "$METHOD" in
			1) ckping ;;
			2) cktracert ;;
			*) ckcurl ;;
		esac

		for IP in $IPLIST; do
			if [ -n "$DEFAULT_ROUTE_FRAGMENT" ]; then
				ROUTE_EXEC="ip route del $IP $DEFAULT_ROUTE_FRAGMENT"
				$DEBUG $ROUTE_EXEC
				$ROUTE_EXEC
			fi
			ROUTE_EXEC="ip route del $IP dev $IFACE $GATEWAY_FRAGMENT metric 50000"
			$DEBUG $ROUTE_EXEC
			$ROUTE_EXEC
		done

		# Real if/else instead of "A && { ... } || { ... }": with the chained
		# form a non-zero status at the end of the "down" branch (e.g. a
		# failing "service ... restart") also ran the "up" branch and marked
		# the WAN as functioning.
		if [ "$RESULT" -eq 0 ]; then
			# wan is down
			if [ "$PROTO" = "lte" ]; then
				$LOGS "Connection $PREFIX DOWN - Reconnecting ..."
				echo "0" > "$STATE_FILE"
				switch4g "$PREFIX"
			else
				# "wan" means restart all WANs, but we only want restart one
				[ "$PREFIX" = "wan" ] && [ "$MWAN" -gt 1 ] && PREFIX_MWAN="wan1"

				ACTION=$(nvram get action_service)
				case "$ACTION" in
					wan-restart|wan-restart-c|"${PREFIX_MWAN}-restart"|"${PREFIX_MWAN}-restart-c")
						$LOGS "Connection $PREFIX DOWN - Reconnect is already in progress ..."
						;;
					*)
						echo "0" > "$STATE_FILE"
						case "$PROTO" in
							pppoe|pptp|l2tp|ppp3g)
								if [ "$DEMAND" = "1" ] && [ "$ISPPPD" -eq 0 ]; then
									$LOGS "Killing orphaned connect-on-demand listen process ..."
									LISTEN_PID=$(ps | grep listen | grep "$PREFIX" | awk '{print $1}' | head -n1)
									# Quoted: an unquoted empty value makes
									# "[ -n ]" true and runs kill without a PID.
									if [ -n "$LISTEN_PID" ]; then
										kill -9 "$LISTEN_PID"
										$LOGS "Killed $LISTEN_PID"
									else
										$LOGS "Connect-on-demand listen not running"
									fi
									$LOGS "Connection $PREFIX DOWN - Reconnecting ..."
									service "$PREFIX_MWAN" restart
								else
									$LOGS "Connection $PREFIX DOWN - Reconnect will be handled by another process ..."
								fi
								;;
							*)
								$LOGS "Connection $PREFIX DOWN - Reconnect will be handled by another process ..."
								;;
						esac
						;;
				esac
			fi
		else
			# connected + DHCP + previous status - disconnected? release/renew
			# (a missing state file simply counts as "not previously down")
			if [ "$PROTO" = "dhcp" ] && [ "$(cat "$STATE_FILE" 2>/dev/null)" = "0" ]; then
				dhcpc-release "$PREFIX"
				sleep 1
				dhcpc-renew "$PREFIX"
			fi
			$DEBUG "$wh Connection $PREFIX is functioning $bl"
			echo "1" > "$STATE_FILE"
		fi
	done
}
# Traceroute check: compare the interface's received-bytes counter before and
# after tracing towards every target in $IPLIST; RESULT is set to 1 when the
# counter grew.
# Globals read: IFACE IPLIST DEBUG. Globals written: RESULT.
cktracert() {
	local RXBYTES1 RXBYTES2 IP
	local RXFILE="/sys/class/net/$IFACE/statistics/rx_bytes"

	RXBYTES1=$(cat "$RXFILE")
	$DEBUG "run tracert for $IFACE ..."

	for IP in $IPLIST; do
		# A few probes are enough to tell whether the link answers.
		# "-z" takes a value of its own (a pause between probes), so the
		# former "-z $IP" swallowed the target and left traceroute without
		# a host; the option is dropped and $IP is passed as the host.
		traceroute -i "$IFACE" -n -w 1 -m 4 -q 1 "$IP" > /dev/null 2>&1
	done

	RXBYTES2=$(cat "$RXFILE")
	[ "$RXBYTES2" -gt "$RXBYTES1" ] && RESULT=1
	$DEBUG "tracert for $IFACE: RX2=$RXBYTES2 RX1=$RXBYTES1"
}
# Ping check: try the targets in $IPLIST one after another through $IFACE and
# stop at the first one for which ping exits successfully.
# Globals read: IFACE IPLIST DEBUG. Globals written: RESULT (incremented once
# on success, left untouched otherwise).
ckping() {
	local IP

	$DEBUG "run ping for $IFACE ..."

	for IP in $IPLIST; do
		# ping's exit status is the verdict for this target.
		if ping -c 3 -A -W 4 -w 4 -I "$IFACE" "$IP" >/dev/null; then
			RESULT=$((RESULT+1))
			$DEBUG "$IFACE - $IP: ping OK=$RESULT"
			# one reachable target is enough
			break
		fi
	done

	if [ "$RESULT" -gt 0 ]; then
		$DEBUG "ping for $IFACE: OK=1"
	fi
}
# Curl check: request the targets in $IPLIST one after another through $IFACE
# and stop at the first one for which curl exits successfully.
# Globals read: IFACE IPLIST DEBUG. Globals written: RESULT (incremented once
# on success, left untouched otherwise).
ckcurl() {
	local IP

	$DEBUG "run curl connect for $IFACE ..."

	for IP in $IPLIST; do
		# curl's exit status is the verdict for this target.
		if curl "$IP" --interface "$IFACE" --connect-timeout 5 -ksfI -o /dev/null; then
			RESULT=$((RESULT+1))
			$DEBUG "$IFACE - $IP: curl connect OK=$RESULT"
			# one reachable target is enough
			break
		fi
	done

	if [ "$RESULT" -gt 0 ]; then
		$DEBUG "curl connect for $IFACE: OK=1"
	fi
}
# Register the periodic watchdog cron job ("watchdogJob") unless it is already
# registered. The interval is nvram "mwan_cktime" (seconds) converted to whole
# minutes; nothing is scheduled when that is less than one minute.
watchdogAdd() {
	local CKTIME MINS

	CKTIME=$(nvram get mwan_cktime)
	# Unset or non-numeric setting: treat as 0, i.e. schedule nothing.
	case "$CKTIME" in
		''|*[!0-9]*) CKTIME=0 ;;
	esac
	MINS=$((CKTIME/60))

	[ "$MINS" -gt 0 ] || return 0

	# "||": add the job only when it is NOT listed yet. The former "&&" added
	# it only when it already existed, so a fresh setup never got a job.
	cru l | grep -q watchdogJob || cru a watchdogJob "*/$MINS * * * * /usr/sbin/watchdog"
}
# Remove the periodic watchdog cron job ("watchdogJob") if the cron listing
# contains it. Returns 1 when no such job is listed, otherwise the status of
# the removal.
watchdogDel() {
	local listing

	listing=$(cru l)
	case "$listing" in
		*watchdogJob*)
			cru d watchdogJob
			;;
		*)
			return 1
			;;
	esac
}
# Keep the once-a-minute "watchdog alive" cron job ("mwanJob") in line with the
# configuration: present when more than one WAN is configured, absent
# otherwise ("watchdog alive" does nothing unless MWAN > 1).
# The former "exists && delete || add" chain toggled the job on every call, so
# each second watchdog run removed it again.
mwanJob() {
	if [ "$MWAN" -gt 1 ] 2>/dev/null; then
		cru l | grep -q mwanJob || cru a mwanJob "*/1 * * * * /usr/sbin/watchdog alive"
	elif cru l | grep -q mwanJob; then
		cru d mwanJob
	fi
}
# With more than one WAN configured, make sure a "mwanroute" process shows up
# in the process list; log and launch it when it does not.
mwanAlive() {
	[ "$MWAN" -gt 1 ] || return 0

	# The bracket keeps grep from matching its own entry in the process
	# list. It must be quoted: unquoted, the shell treats [m]wanroute as a
	# glob and replaces it with "mwanroute" if such a file exists in the
	# current directory.
	if ! ps | grep -q '[m]wanroute'; then
		$LOGS "mwanroute not found, launch process"
		mwanroute
	fi
}
# Single-instance guard. If $PIDFILE names a process that still exists under
# /proc, log and exit 0; otherwise (no file, empty/corrupt content, or a PID
# that is gone) claim the lock by writing our own PID. Exits 0 as well when
# the PID file cannot be written.
checkPid() {
	local PIDNO=""

	[ -f "$PIDFILE" ] && PIDNO=$(cat "$PIDFILE" 2>/dev/null)

	case "$PIDNO" in
		''|*[!0-9]*)
			# Nothing usable recorded. Anything that is not a plain
			# number is treated as stale instead of being pasted into
			# a /proc path.
			;;
		*)
			if [ -e "/proc/$PIDNO/cmdline" ]; then
				$LOGS "Another process in action - Exiting ..."
				exit 0
			fi
			# Process not found assume not running
			;;
	esac

	if ! echo "$PID" > "$PIDFILE"; then
		$LOGS "Could not create PID file"
		exit 0
	fi
}
# Do not interfere with a running switch3g/switch4g script.
# For every WAN prefix and for both script kinds: if the script's PID file
# /var/run/switch<kind>_<prefix>.pid exists and a matching process is in the
# process list, log, drop our own $PIDFILE and exit 0. If the PID file exists
# but no such process does, the PID file is removed as stale.
checkPidSwitch() {
	local SPREFIX KIND LOCK

	for SPREFIX in $MWANTABLE; do
		for KIND in 3g 4g; do
			LOCK="/var/run/switch${KIND}_${SPREFIX}.pid"
			[ -f "$LOCK" ] || continue

			# Quoted bracket pattern: keeps grep from matching itself and
			# keeps the shell from globbing it. -q: only the status is
			# wanted, the matching lines must not be printed.
			if ps | grep -q "[s]witch$KIND"; then
				$LOGS "Switch$KIND ($SPREFIX) script in action - Exiting ..."
				rm -f "$PIDFILE" > /dev/null 2>&1
				exit 0
			fi

			# pid file exists but process doesn't
			rm -f "$LOCK"
		done
	done
}
###################################################
# Entry point.
#   watchdog add    - register the cron jobs
#   watchdog del    - remove the watchdog cron job
#   watchdog alive  - make sure mwanroute is running
#   watchdog        - run one connectivity check over all WANs
case "$1" in
	add)
		watchdogAdd
		mwanJob
		;;
	del)
		watchdogDel
		;;
	alive)
		mwanAlive
		;;
	*)
		checkPid
		checkPidSwitch
		mwanJob

		# Drop any list left behind by an earlier run, so that a findHost
		# that times out cannot make us reuse outdated targets.
		rm -f "$IPLISTFILE"

		# run with a 10 sec timeout to not hang
		timeout 10 findHost
		[ -f "$IPLISTFILE" ] && IPLIST=$(cat "$IPLISTFILE")

		# Collapse whitespace (unquoted on purpose) so that a list made only
		# of blanks counts as empty and gets the fallback target.
		IPLIST=$(echo $IPLIST)
		[ -z "$IPLIST" ] && IPLIST="1.1.1.1" # resilient IP if the list is empty

		watchdogRun
		rm -f "$IPLISTFILE"

		# Only the check run takes the PID lock (checkPid), so only the
		# check run releases it. Releasing it from "add"/"del"/"alive" as
		# well would delete the lock of a check that is still in progress.
		rm -f "$PIDFILE" > /dev/null 2>&1
		;;
esac
echo "─────────────────────────────────────────────────"
Comments (7)
-
reporter -
reporter - edited description
-
reporter - edited description
-
reporter - edited description
-
repo owner Please send me this file via PM system on linksysinfo.org, because here the above form destroys entire formatting (for ex. changing tabs to spaces).
-
reporter Leave it with me: I’ve identified something that appears to be broken in the original implementation. I’ll try to fix that first, too.
-
reporter - changed status to resolved
I'll close this here and send the script to Pedro (after final tweaks) on the forum.
- Log in to comment