[TASK] add respondd-crashed
A tool to find nodes that are wrongly detected as offline, by pinging the nodes which do not answer via respondd.
This commit is contained in:
parent
abae92bb5a
commit
2f6e22f8cc
30
contrib/respondd-crashed/README.md
Normal file
30
contrib/respondd-crashed/README.md
Normal file
@ -0,0 +1,30 @@
|
||||
# respondd-crashed
|
||||
|
||||
This tool pings every "offline" node on each of its IP addresses listed in a meshviewer.json, to detect whether a respondd daemon is no longer running.
|
||||
|
||||
|
||||
## Grant permission to send pings
|
||||
```bash
|
||||
sudo setcap cap_net_raw=+ep $GOPATH/bin/respondd-crashed
|
||||
```
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
Usage of respondd-crashed:
|
||||
-ll-iface string
|
||||
interface to ping linklocal-address
|
||||
-loglevel uint
|
||||
Show log message starting at level (default 40)
|
||||
-meshviewer-path string
|
||||
path to meshviewer.json from yanic (default "meshviewer.json")
|
||||
-ping-count int
|
||||
count of pings (default 3)
|
||||
-ping-timeout duration
|
||||
timeout to wait for response (default 5s)
|
||||
-run-every duration
|
||||
repeat check every (default 1m0s)
|
||||
-status-path string
|
||||
path to store status (default "respondd-crashed.json")
|
||||
-timestamps
|
||||
Enables timestamps for log output
|
24
contrib/respondd-crashed/helper.go
Normal file
24
contrib/respondd-crashed/helper.go
Normal file
@ -0,0 +1,24 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// JSONRequest fetches url with an HTTP GET and decodes the JSON response body
// into value.
//
// value should be a pointer to the destination, as for json.Unmarshal. The
// whole request is bounded by a 20 second client timeout. Transport and
// decoding errors are returned unchanged; the HTTP status code is not
// inspected.
func JSONRequest(url string, value interface{}) error {
	netClient := &http.Client{
		Timeout: time.Second * 20,
	}

	resp, err := netClient.Get(url)
	if err != nil {
		return err
	}
	// The body must be closed on every path, otherwise each call leaks the
	// response body and the connection behind it.
	defer resp.Body.Close()

	// encoding/json follows the pointer stored inside the interface, so the
	// caller's destination is filled in.
	return json.NewDecoder(resp.Body).Decode(&value)
}
|
1
contrib/respondd-crashed/helper_test.go
Normal file
1
contrib/respondd-crashed/helper_test.go
Normal file
@ -0,0 +1 @@
|
||||
package main
|
34
contrib/respondd-crashed/hook.go
Normal file
34
contrib/respondd-crashed/hook.go
Normal file
@ -0,0 +1,34 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"github.com/bdlm/log"
|
||||
stdLogger "github.com/bdlm/std/logger"
|
||||
)
|
||||
|
||||
// Hook is a log hook that points the logger's output at stdout or stderr
// depending on the level of the entry being logged.
type Hook struct{}
|
||||
|
||||
func (hook *Hook) Fire(entry *log.Entry) error {
|
||||
switch entry.Level {
|
||||
case log.PanicLevel:
|
||||
entry.Logger.Out = os.Stderr
|
||||
case log.FatalLevel:
|
||||
entry.Logger.Out = os.Stderr
|
||||
case log.ErrorLevel:
|
||||
entry.Logger.Out = os.Stderr
|
||||
case log.WarnLevel:
|
||||
entry.Logger.Out = os.Stdout
|
||||
case log.InfoLevel:
|
||||
entry.Logger.Out = os.Stdout
|
||||
case log.DebugLevel:
|
||||
entry.Logger.Out = os.Stdout
|
||||
default:
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (hook *Hook) Levels() []stdLogger.Level {
|
||||
return log.AllLevels
|
||||
}
|
86
contrib/respondd-crashed/main.go
Normal file
86
contrib/respondd-crashed/main.go
Normal file
@ -0,0 +1,86 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"os"
|
||||
"os/signal"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/bdlm/log"
|
||||
stdLogger "github.com/bdlm/std/logger"
|
||||
"github.com/digineo/go-ping"
|
||||
)
|
||||
|
||||
// Command-line settings; main binds each of them to a flag.
var (
	// Logging: -timestamps and -loglevel.
	timestamps bool
	loglevel   uint

	// Pause between two checks: -run-every.
	runEvery time.Duration

	// Interface used as zone for link-local addresses: -ll-iface.
	iface string

	// Ping behaviour: -ping-count and -ping-timeout.
	pingCount   int
	pingTimeout time.Duration

	// Input and output locations: -meshviewer-path and -status-path.
	meshviewerPATH string
	statusPath     string
)
|
||||
|
||||
func main() {
|
||||
flag.BoolVar(×tamps, "timestamps", false, "Enables timestamps for log output")
|
||||
flag.UintVar(&loglevel, "loglevel", 40, "Show log message starting at level")
|
||||
|
||||
flag.DurationVar(&runEvery, "run-every", time.Duration(time.Minute), "repeat check every")
|
||||
|
||||
flag.StringVar(&iface, "ll-iface", "", "interface to ping linklocal-address")
|
||||
|
||||
flag.IntVar(&pingCount, "ping-count", 3, "count of pings")
|
||||
flag.DurationVar(&pingTimeout, "ping-timeout", time.Duration(time.Second*5), "timeout to wait for response")
|
||||
|
||||
flag.StringVar(&statusPath, "status-path", "respondd-crashed.json", "path to store status")
|
||||
flag.StringVar(&meshviewerPATH, "meshviewer-path", "meshviewer.json", "path to meshviewer.json from yanic")
|
||||
|
||||
flag.Parse()
|
||||
|
||||
log.AddHook(&Hook{})
|
||||
log.SetLevel(stdLogger.Level(loglevel))
|
||||
log.SetFormatter(&log.TextFormatter{
|
||||
DisableTimestamp: timestamps,
|
||||
})
|
||||
|
||||
pinger, err := ping.New("", "::")
|
||||
if err != nil {
|
||||
log.Panicf("not able to bind pinger: %s", err)
|
||||
}
|
||||
|
||||
timer := time.NewTimer(runEvery)
|
||||
|
||||
stop := false
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
|
||||
log.Info("start tester")
|
||||
|
||||
func() {
|
||||
wg.Add(1)
|
||||
for !stop {
|
||||
select {
|
||||
case <-timer.C:
|
||||
run(pinger)
|
||||
timer.Reset(runEvery)
|
||||
}
|
||||
}
|
||||
timer.Stop()
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
sigs := make(chan os.Signal, 1)
|
||||
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
|
||||
sig := <-sigs
|
||||
stop = true
|
||||
wg.Wait()
|
||||
log.Infof("stopped: %s", sig)
|
||||
|
||||
}
|
119
contrib/respondd-crashed/run.go
Normal file
119
contrib/respondd-crashed/run.go
Normal file
@ -0,0 +1,119 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net"
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
"github.com/bdlm/log"
|
||||
"github.com/digineo/go-ping"
|
||||
|
||||
meshviewerFFRGB "github.com/FreifunkBremen/yanic/output/meshviewer-ffrgb"
|
||||
)
|
||||
|
||||
func pingNode(pinger *ping.Pinger, node *meshviewerFFRGB.Node, addrStr string) bool {
|
||||
logNode := log.WithField("node_id", node.NodeID)
|
||||
|
||||
addr, err := net.ResolveIPAddr("ip6", addrStr)
|
||||
if err != nil {
|
||||
logNode.Warnf("error parse ip address for ping: %s", err)
|
||||
}
|
||||
|
||||
if addrStr[:5] == "fe80:" {
|
||||
if iface == "" {
|
||||
logNode.Debug("skip ll-addr")
|
||||
return false
|
||||
}
|
||||
addr.Zone = iface
|
||||
}
|
||||
logNode = logNode.WithField("addr", addr.String())
|
||||
|
||||
_, err = pinger.PingAttempts(addr, pingTimeout, pingCount)
|
||||
|
||||
logNode.WithFields(map[string]interface{}{
|
||||
"success": err == nil,
|
||||
}).Debug("pong")
|
||||
return err == nil
|
||||
}
|
||||
|
||||
func run(pinger *ping.Pinger) {
|
||||
status := &Status{NodesCrashed: []*Node{}}
|
||||
var meshviewerjson meshviewerFFRGB.Meshviewer
|
||||
|
||||
if meshviewerPATH[:4] == "http" {
|
||||
if err := JSONRequest(meshviewerPATH, &meshviewerjson); err != nil {
|
||||
status.Error = err.Error()
|
||||
log.Errorf("error during fetch meshviewer.json: %s", err)
|
||||
}
|
||||
} else {
|
||||
meshviewerFile, err := os.Open(meshviewerPATH)
|
||||
if err != nil {
|
||||
status.Error = err.Error()
|
||||
log.Errorf("error during fetch meshviewer.json: %s", err)
|
||||
} else if err := json.NewDecoder(meshviewerFile).Decode(&meshviewerjson); err != nil {
|
||||
status.Error = err.Error()
|
||||
log.Errorf("error during decode meshviewer.json: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
log.Debug("fetched meshviewer.json")
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(len(meshviewerjson.Nodes))
|
||||
|
||||
offline := 0
|
||||
for _, node := range meshviewerjson.Nodes {
|
||||
go func(node *meshviewerFFRGB.Node) {
|
||||
defer wg.Done()
|
||||
if node.IsOnline {
|
||||
return
|
||||
}
|
||||
logNode := log.WithField("node", node.NodeID)
|
||||
wgNode := sync.WaitGroup{}
|
||||
wgNode.Add(len(node.Addresses))
|
||||
offline += 1
|
||||
notReachable := true
|
||||
for _, addr := range node.Addresses {
|
||||
go func(node *meshviewerFFRGB.Node, addr string) {
|
||||
if ok := pingNode(pinger, node, addr); ok {
|
||||
notReachable = false
|
||||
}
|
||||
wgNode.Done()
|
||||
}(node, addr)
|
||||
}
|
||||
wgNode.Wait()
|
||||
if !notReachable {
|
||||
logNode.Info("add to crashed list")
|
||||
status.AddNode(node)
|
||||
}
|
||||
}(node)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
status.Lock()
|
||||
status.NodesCount = len(meshviewerjson.Nodes)
|
||||
status.NodesOfflineCount = offline
|
||||
status.Unlock()
|
||||
|
||||
tmpFile := statusPath + ".tmp"
|
||||
statusFile, err := os.OpenFile(tmpFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
|
||||
if err != nil {
|
||||
log.Warnf("unable to open status file: %s", err)
|
||||
}
|
||||
defer statusFile.Close()
|
||||
|
||||
if err := json.NewEncoder(statusFile).Encode(status); err != nil {
|
||||
log.Warnf("unable to write status json: %s", err)
|
||||
}
|
||||
if err := os.Rename(tmpFile, statusPath); err != nil {
|
||||
log.Warnf("unable to move status file: %s", err)
|
||||
}
|
||||
|
||||
log.WithFields(map[string]interface{}{
|
||||
"count_meshviewer": status.NodesCount,
|
||||
"count_offline": status.NodesOfflineCount,
|
||||
"count_status": len(status.NodesCrashed),
|
||||
}).Info("test complete")
|
||||
}
|
31
contrib/respondd-crashed/status.go
Normal file
31
contrib/respondd-crashed/status.go
Normal file
@ -0,0 +1,31 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
meshviewerFFRGB "github.com/FreifunkBremen/yanic/output/meshviewer-ffrgb"
|
||||
)
|
||||
|
||||
// Node is one entry of the crashed-node list in the status output. It carries
// the identifying fields of a node together with the addresses of that node.
type Node struct {
	// NodeID is encoded as "node_id".
	NodeID string `json:"node_id"`
	// Hostname is encoded as "hostname".
	Hostname string `json:"hostname"`
	// Addresses is encoded as "addresses".
	Addresses []string `json:"addresses"`
}
|
||||
|
||||
type Status struct {
|
||||
Error string `json:"error,omitempty"`
|
||||
NodesCount int `json:"nodes_count"`
|
||||
NodesOfflineCount int `json:"nodes_offline_count"`
|
||||
NodesCrashed []*Node `json:"nodes_crashed"`
|
||||
sync.Mutex
|
||||
}
|
||||
|
||||
func (s *Status) AddNode(node *meshviewerFFRGB.Node) {
|
||||
s.Lock()
|
||||
s.NodesCrashed = append(s.NodesCrashed, &Node{
|
||||
NodeID: node.NodeID,
|
||||
Hostname: node.Hostname,
|
||||
Addresses: node.Addresses,
|
||||
})
|
||||
s.Unlock()
|
||||
}
|
Loading…
Reference in New Issue
Block a user