Skip to content

Commit 6810f23

Browse files
committed
Fix multi-node kill bug due to memberlist not resolving hostnames; add admin API for inspecting node discovery state
Signed-off-by: tinswzy <zhenyuan.wei@zilliz.com>
1 parent 9024025 commit 6810f23

File tree

9 files changed

+316
-123
lines changed

9 files changed

+316
-123
lines changed

cmd/main.go

Lines changed: 8 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@ import (
2020
"context"
2121
"flag"
2222
"fmt"
23-
"github.com/zilliztech/woodpecker/common/logger"
24-
"github.com/zilliztech/woodpecker/common/tracer"
2523
"log"
2624
"net"
2725
"os"
@@ -32,7 +30,9 @@ import (
3230
"github.com/zilliztech/woodpecker/cmd/external"
3331
"github.com/zilliztech/woodpecker/common/config"
3432
commonhttp "github.com/zilliztech/woodpecker/common/http"
33+
"github.com/zilliztech/woodpecker/common/logger"
3534
"github.com/zilliztech/woodpecker/common/membership"
35+
"github.com/zilliztech/woodpecker/common/tracer"
3636
"github.com/zilliztech/woodpecker/server"
3737
)
3838

@@ -54,43 +54,6 @@ func parseAdvertiseAddr(addrPort string) (string, int, error) {
5454

5555
return host, port, nil
5656
}
57-
58-
// resolveAdvertiseAddr turns a hostname into an IP address string.
//
// An empty addr yields "", and an addr that already parses as an IP is
// returned untouched. Otherwise the name is looked up: the first IPv4 result
// wins, falling back to the first result of any family. When the lookup fails
// or produces no addresses, a warning is logged and addr is returned as-is.
func resolveAdvertiseAddr(addr string) string {
	if len(addr) == 0 {
		return ""
	}

	// IP literals need no resolution.
	if net.ParseIP(addr) != nil {
		return addr
	}

	resolved, lookupErr := net.LookupIP(addr)
	if lookupErr != nil {
		log.Printf("Warning: Failed to resolve hostname '%s' to IP: %v. Using as-is.", addr, lookupErr)
		return addr
	}

	// First pass: pick an IPv4 address if one exists.
	for _, candidate := range resolved {
		v4 := candidate.To4()
		if v4 == nil {
			continue
		}
		log.Printf("Resolved hostname '%s' to IPv4: %s", addr, v4.String())
		return v4.String()
	}

	if len(resolved) == 0 {
		log.Printf("Warning: No IP found for hostname '%s'. Using as-is.", addr)
		return addr
	}

	// No IPv4 result: use whatever came first (may be IPv6).
	first := resolved[0].String()
	log.Printf("Resolved hostname '%s' to IP: %s", addr, first)
	return first
}
93-
9457
func main() {
9558
var (
9659
servicePort = flag.Int("service-port", 18080, "service port")
@@ -180,7 +143,6 @@ func main() {
180143
} else {
181144
advertisePort = *gossipPort
182145
}
183-
resourceAdvertiseGossipAddrStr := resolveAdvertiseAddr(advertiseAddrStr)
184146

185147
if *advertiseServiceAddr != "" {
186148
addr, port, err := parseAdvertiseAddr(*advertiseServiceAddr)
@@ -198,10 +160,10 @@ func main() {
198160
NodeID: *nodeName,
199161
BindPort: *gossipPort,
200162
ServicePort: *servicePort,
201-
AdvertiseAddr: resourceAdvertiseGossipAddrStr, // Gossip advertise address (IP only)
202-
AdvertisePort: advertisePort, // Gossip advertise port
203-
AdvertiseServiceAddr: advertiseServiceAddrStr, // Service advertise address (hostname only)
204-
AdvertiseServicePort: advertiseServicePort, // Service advertise port
163+
AdvertiseAddr: advertiseAddrStr, // Gossip advertise address (IP only)
164+
AdvertisePort: advertisePort, // Gossip advertise port
165+
AdvertiseServiceAddr: advertiseServiceAddrStr, // Service advertise address (hostname only)
166+
AdvertiseServicePort: advertiseServicePort, // Service advertise port
205167
ResourceGroup: *resourceGroup,
206168
AZ: *availabilityZone,
207169
Tags: map[string]string{"role": "logstore"},
@@ -217,10 +179,10 @@ func main() {
217179
}
218180

219181
// Start HTTP server for metrics, health check, and pprof
220-
if err := commonhttp.Start(cfg); err != nil {
182+
if err := commonhttp.Start(cfg, srv.GetServerNodeMemberlistStatus); err != nil {
221183
log.Fatalf("Failed to start HTTP server: %v", err)
222184
}
223-
log.Printf("HTTP server started on port %s (metrics, health, pprof)", commonhttp.DefaultListenPort)
185+
log.Printf("HTTP server started on port %s (metrics, health, pprof, admin)", commonhttp.DefaultListenPort)
224186

225187
log.Printf("Starting Woodpecker Server:")
226188
log.Printf(" Node Name: %s", *nodeName)

common/http/router.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ const MetricsRouterPath = "/metrics"
2525
// LogLevelRouterPath is the path for getting and updating the log level at runtime.
2626
const LogLevelRouterPath = "/log/level"
2727

28+
// AdminMemberlistPath is the path of the admin endpoint that reports memberlist status.
29+
const AdminMemberlistPath = "/admin/memberlist"
30+
2831
// Pprof paths are automatically registered by importing net/http/pprof
2932
// Available paths:
3033
// - /debug/pprof/

common/http/server.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,18 @@ func registerDefaults(cfg *config.Configuration) {
102102
}
103103

104104
// Start initializes and starts the HTTP server
105-
func Start(cfg *config.Configuration) error {
105+
func Start(cfg *config.Configuration, GetServerNodeMemberlistStatus func() string) error {
106106
// Register default handlers
107107
registerDefaults(cfg)
108108

109+
// Register admin handler for memberlist status
110+
Register(&Handler{
111+
Path: AdminMemberlistPath,
112+
HandlerFunc: func(writer http.ResponseWriter, request *http.Request) {
113+
fmt.Fprintf(writer, GetServerNodeMemberlistStatus())
114+
},
115+
})
116+
109117
// Get listen port from environment or use default
110118
port := os.Getenv(ListenPortEnvKey)
111119
if port == "" {

common/membership/server_node.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,22 @@ func (n *ServerNode) GetServerCfg() *ServerConfig {
189189
// GetMeta returns the node metadata stored on this ServerNode (n.meta).
func (n *ServerNode) GetMeta() *proto.NodeMeta {
190190
return n.meta
191191
}
192+
193+
// GetMemberlistStatus returns a formatted string of all memberlist members and their addresses
194+
func (n *ServerNode) GetMemberlistStatus() string {
195+
members := n.memberlist.Members()
196+
if len(members) == 0 {
197+
return "No members in memberlist"
198+
}
199+
200+
result := fmt.Sprintf("Total Members: %d\n\n", len(members))
201+
for i, member := range members {
202+
result += fmt.Sprintf("[%d] Name: %s\n", i+1, member.Name)
203+
result += fmt.Sprintf(" Addr: %s:%d\n", member.Addr.String(), member.Port)
204+
result += fmt.Sprintf(" State: %d\n", member.State)
205+
if i < len(members)-1 {
206+
result += "\n"
207+
}
208+
}
209+
return result
210+
}

common/net/addr.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,35 @@ func GetValidLocalIP(addrs []net.Addr) string {
7777
}
7878
return ""
7979
}
80+
81+
// ResolveAdvertiseAddr converts addr into a net.IP.
//
// An IP literal is parsed and returned directly. Any other non-empty value is
// treated as a hostname and looked up; the first IPv4 result is preferred,
// otherwise the first result of any family is used. It returns nil when addr
// is empty, when the lookup fails, or when the lookup yields no addresses.
func ResolveAdvertiseAddr(addr string) net.IP {
	if len(addr) == 0 {
		return nil
	}

	// Already an IP literal: nothing to resolve.
	if parsed := net.ParseIP(addr); parsed != nil {
		return parsed
	}

	resolved, lookupErr := net.LookupIP(addr)
	if lookupErr != nil || len(resolved) == 0 {
		return nil
	}

	// Prefer an IPv4 address when the host has one.
	for _, candidate := range resolved {
		if v4 := candidate.To4(); v4 != nil {
			return v4
		}
	}

	// Only non-IPv4 results are left; take the first.
	return resolved[0]
}

go.mod

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ require (
1313
github.com/google/gops v0.3.28
1414
github.com/google/uuid v1.6.0
1515
github.com/grafana/pyroscope-go/godeltaprof v0.1.8
16+
github.com/hashicorp/golang-lru/v2 v2.0.7
1617
github.com/hashicorp/memberlist v0.5.3
1718
github.com/json-iterator/go v1.1.12
1819
github.com/klauspost/compress v1.17.9
@@ -29,6 +30,7 @@ require (
2930
github.com/uber/jaeger-client-go v2.30.0+incompatible
3031
go.etcd.io/etcd/client/v3 v3.5.5
3132
go.etcd.io/etcd/server/v3 v3.5.5
33+
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0
3234
go.opentelemetry.io/otel v1.28.0
3335
go.opentelemetry.io/otel/exporters/jaeger v1.13.0
3436
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.20.0
@@ -45,6 +47,8 @@ require (
4547
gopkg.in/yaml.v3 v3.0.1
4648
)
4749

50+
replace github.com/hashicorp/memberlist => github.com/tinswzy/memberlist v0.5.3-hotfix-20251119
51+
4852
require (
4953
cloud.google.com/go/compute/metadata v0.3.0 // indirect
5054
github.com/Azure/azure-sdk-for-go/sdk/internal v1.8.0 // indirect
@@ -81,12 +85,10 @@ require (
8185
github.com/hashicorp/errwrap v1.0.0 // indirect
8286
github.com/hashicorp/go-immutable-radix v1.0.0 // indirect
8387
github.com/hashicorp/go-metrics v0.5.4 // indirect
84-
github.com/hashicorp/go-msgpack v0.5.3 // indirect
8588
github.com/hashicorp/go-msgpack/v2 v2.1.1 // indirect
8689
github.com/hashicorp/go-multierror v1.0.0 // indirect
8790
github.com/hashicorp/go-sockaddr v1.0.0 // indirect
8891
github.com/hashicorp/golang-lru v0.5.1 // indirect
89-
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
9092
github.com/jonboulle/clockwork v0.2.2 // indirect
9193
github.com/klauspost/cpuid/v2 v2.2.8 // indirect
9294
github.com/kr/pretty v0.3.1 // indirect
@@ -129,7 +131,6 @@ require (
129131
go.etcd.io/etcd/client/v2 v2.305.5 // indirect
130132
go.etcd.io/etcd/pkg/v3 v3.5.5 // indirect
131133
go.etcd.io/etcd/raft/v3 v3.5.5 // indirect
132-
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect
133134
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.20.0 // indirect
134135
go.opentelemetry.io/otel/metric v1.28.0 // indirect
135136
go.opentelemetry.io/proto/otlp v1.0.0 // indirect

0 commit comments

Comments
 (0)