[solved] new version of watchdog

Issue #198 resolved
rs232 created an issue

I had some time today to look into the watchdog script and found a few items I could optimise. The main changes are:

Changelog

  • don't use nslookup if targets are defined as IP
  • general code optimisation
  • better condition statements
  • optimised all the watchdog methods (traceroute/ping/curl)
  • traceroute doesn’t work properly when FT is behind a main router: the received-byte counter will always be greater after the traceroute has run, because the gateway itself responds. As a matter of fact, I would suggest not using it at all, and certainly removing it as the default method, which should be ping instead, IMO.

2020.8 watchdog:
real 0m 13.88s
user 0m 0.14s
sys 0m 0.23s

New watchdog:
real 0m 0.33s
user 0m 0.12s
sys 0m 0.15s

We need to add some note on the GUI covering these points:

  • ping: make sure the item is pingable under normal circumstances
  • curl: make sure there’s a webserver responding at the provided host
  • traceroute: might not work as expected in certain scenarios
  • It is advised to define only 1 target (2 is OK, but are we really worried about Google being down?)
  • It is advised to use IP instead of FQDN (to avoid lookup processing)

P.S. I’m sure I can go further with the optimisation, but I would prefer this version to be tested first.

#!/bin/sh
# Copyright (C) 2015 shibby
# changes/fixes: 2018 - 2021 pedro
# optimised: 2022 rs232

# ---------------------------------------------------------------------------
# Global configuration and logging helpers.
# ---------------------------------------------------------------------------
PID=$$
PIDFILE="/var/run/watchdog.pid"
IPLISTFILE="/tmp/watchdog.iplist"
MWAN=$(nvram get mwan_num)
IPLIST=""
TMP_ROUTE_TABLE_ID=555
MWANTABLE="wan"
wh="\033[1;37m"
bl="\033[0;40m"

# A missing or non-numeric mwan_num would make every numeric test on $MWAN
# fail with a shell error; treat it as a single-WAN setup instead.
case "$MWAN" in
    ''|*[!0-9]*) MWAN=1 ;;
esac

echo -e "
─────────────────── WatchDog ───────────────────"

# Build the list of WAN prefixes to check: "wan wan2 wan3 ...".
i=1
while [ "$i" -le "$MWAN" ]; do
    [ "$i" -gt 1 ] && MWANTABLE="$MWANTABLE wan$i"
    i=$((i+1))
done

# Logging goes through functions so that the tag "watchdog[<pid>]" and the
# "***" debug marker are passed as quoted, literal words.  When they were
# stored inside the LOGS/DEBUG strings they were subject to pathname expansion
# on every use (e.g. "***" expanded to the files of the current directory).
# The rest of the script keeps calling them as: $LOGS "text" / $DEBUG "text".
logInfo() { logger -t "watchdog[$PID]" "$@"; }
logDebug() { logger -p DEBUG -t "watchdog[$PID]" "***" "$@"; }
printDebug() { echo -e "$@"; }

LOGS="logInfo"
# mwan_debug > 0: debug messages go to syslog; otherwise they go to stdout.
if [ "$(nvram get mwan_debug)" -gt 0 ] 2>/dev/null; then
    DEBUG="logDebug"
else
    DEBUG="printDebug"
fi

# Run a command with a time limit.
#   $1     - limit, handed to sleep as-is
#   $2...  - command (or shell function) and its arguments
# The command runs in a background subshell; a second background subshell
# sleeps for the limit and then kills the first one.
# Returns the wait status of the command's subshell.
timeout() {
    local limit=$1
    local job_pid timer_pid status

    # The positional parameters are only shifted inside the subshell.
    (shift; "$@") &
    job_pid=$!

    # Timer: once the limit elapses, terminate the job if it is still there.
    (sleep "$limit"; kill "$job_pid" 2>/dev/null) &
    timer_pid=$!

    wait "$job_pid"
    status=$?

    # The job is done (or was killed) - the timer is no longer needed.
    kill "$timer_pid" 2>/dev/null
    return "$status"
}

# Turn the comma-separated target list from nvram (mwan_ckdst) into a list of
# addresses and store it in $IPLISTFILE.
# Entries that look like a host name are resolved through the local resolver
# (127.0.0.1); everything else (i.e. plain IPs) is taken over unchanged, so no
# lookup is performed for it.
# Globals: IPLIST (appended to), IPLISTFILE (written)
# Note: this is started through timeout(), i.e. in a subshell, which is why
# the result is handed back through a file and not through $IPLIST.
findHost() {
    local host ip dst hostlist

    dst=$(nvram get mwan_ckdst)
    # 'g' flag: replace every comma, not just the first one
    hostlist=$(echo "$dst" | sed 's/,/ /g')

    for host in $hostlist; do
        # -q: only the match status is needed, do not print the host
        if echo "$host" | grep -Eq '((([a-zA-Z]{1,2})|([0-9]{1,2})|([a-zA-Z0-9]{1,2})|([a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9]))\.)+[a-zA-Z]{2,6}'; then
            # Skip the 4 header lines, drop lines containing "::", then take
            # the 3rd field of the last remaining line.
            # NOTE(review): relies on the "Address N: <ip> <name>" layout of
            # the nslookup output - confirm against the nslookup in use.
            ip=$(nslookup "$host" 127.0.0.1 2>/dev/null | tail -n +5 | grep -v :: | awk '{print $3}' | tail -n 1)
            # an unresolvable name contributes nothing to the list
            [ -n "$ip" ] && IPLIST="${IPLIST} $ip"
        else
            IPLIST="${IPLIST} $host"
        fi
    done

    echo "$IPLIST" > "$IPLISTFILE" # because we're in subshell
}

# Check every WAN listed in $MWANTABLE and react to the result.
# For each WAN prefix:
#   1. read its settings from nvram; skip it when paused or disabled,
#   2. add temporary host routes for all targets in $IPLIST,
#   3. run the configured check (1 = ping, 2 = traceroute, other = curl);
#      the check reports success by setting RESULT > 0,
#   4. remove the temporary routes again,
#   5. WAN down: write "0" to the state file and reconnect where this script
#      is responsible for it (LTE, PPP connect-on-demand without pppd);
#      WAN up: on DHCP release/renew the lease if the previous state was
#      "0", then write "1" to the state file.
# Globals read:    MWANTABLE MWAN IPLIST DEBUG LOGS wh bl
# Globals written: PREFIX IFACE ISPPPD WEIGHT METHOD PROTO DEMAND RESULT
#                  PREFIX_MWAN STATE_FILE (the ck* functions read IFACE,
#                  IPLIST and update RESULT)
watchdogRun() {
    for PREFIX in $MWANTABLE; do
        IFACE=$(nvram get "$PREFIX"_iface)
        ISPPPD=$([ -f "/tmp/ppp/pppd$PREFIX" ] && echo 1 || echo 0)
        WEIGHT=$(nvram get "$PREFIX"_weight)
        METHOD=$(nvram get "$PREFIX"_ckmtd)
        PROTO=$(nvram get "$PREFIX"_proto)
        DEMAND=$(nvram get "$PREFIX"_ppp_demand)
        RESULT=0
        PREFIX_MWAN=$PREFIX
        STATE_FILE="/var/lib/misc/${PREFIX}_state"

        [ "$(nvram get "$PREFIX"_ck_pause)" -eq 1 ] && {
            $DEBUG "Watchdog paused for $PREFIX - skipping ..."
            continue
        }

        [ "$PROTO" = "disabled" ] && continue

        [ "$(nvram get mwan_debug)" -gt 0 ] && {
            ISUP=$(wanuptime "$PREFIX")
            ISGW=$(ip route | grep "$IFACE" | grep -v "link" | wc -l)
            $DEBUG "prefix=$PREFIX, iface=$IFACE, uptime=$ISUP, ISGW=$ISGW, WEIGHT=$WEIGHT"
        }

        DEFAULT_ROUTE_FRAGMENT=$(ip route | grep default | cut -d' ' -f2-)
        GATEWAY_FRAGMENT="via $(nvram get "$PREFIX"_gateway)"
        [ "$ISPPPD" -eq 1 ] && GATEWAY_FRAGMENT=""

        # Temporary routes for the check targets.
        # $ROUTE_EXEC is expanded unquoted on purpose: it holds a whole
        # command line that has to be split into words.
        for IP in $IPLIST; do
            [ -n "$DEFAULT_ROUTE_FRAGMENT" ] && {
                ROUTE_EXEC="ip route add $IP $DEFAULT_ROUTE_FRAGMENT"
                $DEBUG $ROUTE_EXEC
                $ROUTE_EXEC
            }
            ROUTE_EXEC="ip route add $IP dev $IFACE $GATEWAY_FRAGMENT metric 50000"
            $DEBUG $ROUTE_EXEC
            $ROUTE_EXEC
        done

        case "$METHOD" in
            1) ckping ;;
            2) cktracert ;;
            *) ckcurl ;;
        esac

        for IP in $IPLIST; do
            [ -n "$DEFAULT_ROUTE_FRAGMENT" ] && {
                ROUTE_EXEC="ip route del $IP $DEFAULT_ROUTE_FRAGMENT"
                $DEBUG $ROUTE_EXEC
                $ROUTE_EXEC
            }
            ROUTE_EXEC="ip route del $IP dev $IFACE $GATEWAY_FRAGMENT metric 50000"
            $DEBUG $ROUTE_EXEC
            $ROUTE_EXEC
        done

        # Real if/else instead of "test && {..} || {..}": with the latter a
        # non-zero status of the last command in the "down" branch (e.g. a
        # failing restart) also executed the "up" branch and marked the WAN
        # as working.
        if [ "$RESULT" -eq 0 ]; then
            # wan is down
            if [ "$PROTO" = "lte" ]; then
                $LOGS "Connection $PREFIX DOWN - Reconnecting ..."
                echo "0" > "$STATE_FILE"
                switch4g "$PREFIX"
            else
                # "wan" means restart all WANs, but we only want restart one
                [ "$PREFIX" = "wan" ] && [ "$MWAN" -gt 1 ] && PREFIX_MWAN="wan1"

                case "$(nvram get action_service)" in
                    wan-restart|wan-restart-c|"$PREFIX_MWAN-restart"|"$PREFIX_MWAN-restart-c")
                        $LOGS "Connection $PREFIX DOWN - Reconnect is already in progress ..."
                        ;;
                    *)
                        echo "0" > "$STATE_FILE"

                        if { [ "$PROTO" = "pppoe" ] || [ "$PROTO" = "pptp" ] || [ "$PROTO" = "l2tp" ] || [ "$PROTO" = "ppp3g" ]; } \
                            && [ "$DEMAND" -eq 1 ] && [ "$ISPPPD" -eq 0 ]; then
                            $LOGS "Killing orphaned connect-on-demand listen process ..."
                            LISTEN_PID=$(ps | grep listen | grep "$PREFIX" | awk '{print $1}' | head -n1)
                            # must be quoted: [ -n ] without an operand is always true
                            if [ -n "$LISTEN_PID" ]; then
                                kill -9 "$LISTEN_PID"
                                $LOGS "Killed $LISTEN_PID"
                            else
                                $LOGS "Connect-on-demand listen not running"
                            fi

                            $LOGS "Connection $PREFIX DOWN - Reconnecting ..."
                            service "$PREFIX_MWAN" restart
                        else
                            $LOGS "Connection $PREFIX DOWN - Reconnect will be handled by another process ..."
                        fi
                        ;;
                esac
            fi
        else
            # connected + DHCP + previous status - disconnected? release/renew
            if [ "$PROTO" = "dhcp" ] && [ "$(cat "$STATE_FILE" 2>/dev/null || echo 1)" -eq 0 ]; then
                dhcpc-release "$PREFIX"
                sleep 1
                dhcpc-renew "$PREFIX"
            fi
            $DEBUG "$wh Connection $PREFIX is functioning $bl"
            echo "1" > "$STATE_FILE"
        fi
    done
}

# Traceroute based check: compares the rx_bytes counter of $IFACE before and
# after tracing all targets in $IPLIST and sets RESULT=1 when it has grown.
# Globals: IFACE, IPLIST, DEBUG (read); RESULT (written)
cktracert() {
    local RXBYTES1 RXBYTES2 IP
    local STATS="/sys/class/net/$IFACE/statistics/rx_bytes"

    RXBYTES1=$(cat "$STATS" 2>/dev/null)

    $DEBUG "run tracert for $IFACE ..."

    for IP in $IPLIST; do
        # we need only send/receive few packages to be sure is connection works.
        # No "-z" here: that option takes a value (pause in msec), so
        # "-z $IP" consumed the target itself and left traceroute without
        # a host to trace.
        traceroute -i "$IFACE" -n -w 1 -m 4 -q 1 "$IP" > /dev/null 2>&1
    done

    RXBYTES2=$(cat "$STATS" 2>/dev/null)

    # an unreadable counter counts as 0 instead of raising a test error
    [ "${RXBYTES2:-0}" -gt "${RXBYTES1:-0}" ] && RESULT=1
    $DEBUG "tracert for $IFACE: RX2=$RXBYTES2 RX1=$RXBYTES1"
}

# Ping based check: walks through $IPLIST and stops at the first target that
# answers on $IFACE; RESULT is incremented for that target.
# Globals: IFACE, IPLIST, DEBUG (read); RESULT (updated)
ckping() {
    local IP CHECK

    $DEBUG "run ping for $IFACE ..."

    for IP in $IPLIST; do
        # no reply from this target - try the next one
        ping -c 3 -A -W 4 -w 4 -I $IFACE $IP >/dev/null || continue

        # one answering target is enough
        RESULT=$((RESULT+1))
        $DEBUG "$IFACE - $IP: $CHECK; ping OK=$RESULT"
        break
    done

    [ "$RESULT" -gt 0 ] && $DEBUG "ping for $IFACE: OK=1"
}

# HTTP based check: requests the headers of each target in $IPLIST through
# $IFACE and stops at the first one that succeeds; RESULT is incremented for
# that target.
# Globals: IFACE, IPLIST, DEBUG (read); RESULT (updated)
ckcurl() {
    local IP CHECK

    $DEBUG "run curl connect for $IFACE ..."

    for IP in $IPLIST; do
        # request failed - try the next target
        curl $IP --interface $IFACE --connect-timeout 5 -ksfI -o /dev/null || continue

        # one answering target is enough
        RESULT=$((RESULT+1))
        $DEBUG "$IFACE - $IP: $CHECK; curl connect OK=$RESULT"
        break
    done

    [ "$RESULT" -gt 0 ] && $DEBUG "curl connect for $IFACE: OK=1"
}

# Register the periodic watchdog run (cru id "watchdogJob").
# The interval is nvram mwan_cktime (seconds) converted to whole minutes;
# nothing is scheduled when that is less than one minute or not a number.
# The job is only added when it is not registered yet.
watchdogAdd() {
    local CKTIME MINS

    CKTIME=$(nvram get mwan_cktime)
    # empty / non-numeric value would break the arithmetic below
    case "$CKTIME" in
        ''|*[!0-9]*) CKTIME=0 ;;
    esac
    MINS=$((CKTIME/60))

    [ "$MINS" -gt 0 ] || return 0

    # "||": add when the job is missing (with "&&" it was only ever added
    # when it already existed, i.e. never on a fresh system)
    cru l | grep -q watchdogJob || cru a watchdogJob "*/$MINS * * * * /usr/sbin/watchdog"
}

# Remove the periodic watchdog run (cru id "watchdogJob") if it is registered.
watchdogDel() {
    # not in the job list: nothing to delete, hand back grep's status
    cru l | grep watchdogJob >/dev/null || return

    cru d watchdogJob
}

# Make sure the once-a-minute "watchdog alive" job (cru id "mwanJob") is
# registered.
# This is called on every watchdog run, so it must be idempotent: the job is
# added only when it is missing.  The previous "found && delete || add" chain
# removed an existing job again on every second call.
mwanJob() {
    cru l | grep -q mwanJob || cru a mwanJob "*/1 * * * * /usr/sbin/watchdog alive"
}

# In a multi-WAN setup (MWAN > 1) start mwanroute when it does not show up in
# the process list.
# Globals: MWAN, LOGS (read)
mwanAlive() {
    [ "$MWAN" -gt 1 ] || return 0

    # The bracket keeps grep from matching its own command line.  The pattern
    # has to be quoted: unquoted it is a shell glob and would turn into plain
    # "mwanroute" whenever a file of that name exists in the current
    # directory, which makes grep find itself.
    ps | grep '[m]wanroute' >/dev/null && return 0

    $LOGS "mwanroute not found, launch process"
    mwanroute
}

# Single-instance guard.
# If $PIDFILE names a process that still exists under /proc, log a message
# and exit the script with status 0.  Otherwise (no PID file, empty file or
# stale PID) write our own PID into $PIDFILE; if that fails, log and exit 0.
# Globals: PIDFILE, PID, LOGS (read)
checkPid() {
    local PIDNO=""

    [ -f "$PIDFILE" ] && PIDNO=$(cat "$PIDFILE" 2>/dev/null)

    if [ -n "$PIDNO" ] && [ -e "/proc/$PIDNO/cmdline" ]; then
        $LOGS "Another process in action - Exiting ..."
        exit 0
    fi

    # No PID file or process not found - assume not running and take over.
    # (Written exactly once; the old "&& {} || {}" nesting wrote it twice
    # and returned a non-zero status even on success.)
    if ! echo "$PID" > "$PIDFILE"; then
        $LOGS "Could not create PID file"
        exit 0
    fi
}

# Do not interfere with a running switch3g/switch4g script.
# For every WAN prefix and both script generations: when the script's PID
# file /var/run/switch<gen>_<prefix>.pid exists and a matching process shows
# up in ps, remove our own $PIDFILE and exit the script with status 0.  A PID
# file without a matching process is stale and gets removed.
# Globals: MWANTABLE, PIDFILE, LOGS (read)
checkPidSwitch() {
    local SPREFIX GEN SPIDFILE

    for SPREFIX in $MWANTABLE; do
        for GEN in 3g 4g; do
            SPIDFILE="/var/run/switch${GEN}_${SPREFIX}.pid"
            [ -f "$SPIDFILE" ] || continue

            # Quoted pattern (no accidental globbing); the bracket keeps grep
            # from matching itself.  Output is discarded - only the status
            # matters.
            if ps | grep "[s]witch$GEN" >/dev/null; then
                $LOGS "Switch$GEN ($SPREFIX) script in action - Exiting ..."
                rm -f "$PIDFILE" > /dev/null 2>&1
                exit 0
            fi

            # pid file exists but process doesn't
            rm -f "$SPIDFILE"
        done
    done
}


###################################################


# Entry point.
#   watchdog add    - register the cron jobs
#   watchdog del    - remove the watchdog cron job
#   watchdog alive  - make sure mwanroute is running (multi-WAN)
#   watchdog        - run the connection check for all WANs
case "$1" in
    add)
        watchdogAdd
        mwanJob
        ;;
    del)
        watchdogDel
        ;;
    alive)
        mwanAlive
        ;;
    *)
        checkPid

        checkPidSwitch

        mwanJob

        # a list left over from an aborted run must not be picked up below
        rm -f "$IPLISTFILE"

        # run with a 10 sec timeout to not hang
        timeout 10 findHost
        [ -f "$IPLISTFILE" ] && IPLIST=$(cat "$IPLISTFILE")
        # resilient IP if the list is empty (or holds nothing but blanks)
        case "$IPLIST" in
            *[!\ ]*) ;;
            *) IPLIST="1.1.1.1" ;;
        esac
        watchdogRun
        rm -f "$IPLISTFILE"

        # Only the checking instance owns the PID file.  Removing it after
        # add/del/alive as well would delete the lock of a check that is
        # running at the same time (the "alive" job runs every minute).
        rm -f "$PIDFILE" > /dev/null 2>&1
        ;;
esac
echo "─────────────────────────────────────────────────"

Comments (7)

  1. pedro repo owner

    Please send me this file via the PM system on linksysinfo.org, because the form above destroys the entire formatting (e.g. it changes tabs to spaces).

  2. rs232 reporter

    Leave it with me. I’ve identified something that appears to be broken in the original implementation; I’ll try to fix that first, too.

  3. Log in to comment