Skip to content

Commit e117439

Browse files
committed
Put the shutting down server into downtime on the Director
- And lift the downtime when the server is back online
1 parent d9d9f9e commit e117439

2 files changed

Lines changed: 34 additions & 6 deletions

File tree

director/director.go

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1171,16 +1171,15 @@ func registerServerAd(engineCtx context.Context, ctx *gin.Context, sType server_
11711171
adV2.Version = "unknown"
11721172
}
11731173

1174+
sn := adV2.Name
11741175
// Process received server(origin/cache) downtimes and toggle the director's in-memory downtime tracker when necessary
11751176
if adV2.Downtimes != nil {
11761177
filteredServersMutex.Lock()
1177-
defer filteredServersMutex.Unlock()
11781178

11791179
// Update the cached server downtime list
1180-
serverDowntimes[adV2.Name] = adV2.Downtimes
1180+
serverDowntimes[sn] = adV2.Downtimes
11811181

11821182
now := time.Now().UTC().UnixMilli()
1183-
sn := adV2.Name
11841183
active := false // Flag to indicate if this server has active downtime in this server ad
11851184
for _, dt := range adV2.Downtimes {
11861185
if dt.StartTime <= now &&
@@ -1204,6 +1203,35 @@ func registerServerAd(engineCtx context.Context, ctx *gin.Context, sType server_
12041203
delete(filteredServers, sn)
12051204
}
12061205
}
1206+
filteredServersMutex.Unlock()
1207+
}
1208+
1209+
// "Status" represents the server's overall health status. It is introduced in Pelican 7.17.0
1210+
if adV2.Status != "" { // For backward compatibility, we only process this if it is set
1211+
// If the server is about to shut down, we silently put it into downtime.
1212+
// Then it will not receive new requests from the Director, but it will still be able to serve the existing ones.
1213+
if metrics.ParseHealthStatus(adV2.Status) == metrics.StatusShuttingDown {
1214+
filteredServersMutex.Lock()
1215+
// Inspect the existing downtime status for this server
1216+
existingFilterType, isServerFiltered := filteredServers[sn]
1217+
1218+
// Put the server in downtime only if no filter (downtime) exists or it was tempAllowed
1219+
if !isServerFiltered || existingFilterType == tempAllowed {
1220+
filteredServers[sn] = shutdownFiltered
1221+
log.Debugf("Server %s is shutting down, applying downtime to prevent new transfer requests", sn)
1222+
}
1223+
filteredServersMutex.Unlock()
1224+
} else if metrics.ParseHealthStatus(adV2.Status) != metrics.StatusShuttingDown {
1225+
// If the server is back online, we remove its existing shutdown filter, if any
1226+
filteredServersMutex.Lock()
1227+
if existingFilterType, isServerFiltered := filteredServers[sn]; isServerFiltered {
1228+
if existingFilterType == shutdownFiltered {
1229+
delete(filteredServers, sn)
1230+
log.Debugf("Removed the active downtime for server %s as it has come back online", sn)
1231+
}
1232+
}
1233+
filteredServersMutex.Unlock()
1234+
}
12071235
}
12081236

12091237
// Forward to other directors, if applicable

docs/parameters.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2850,9 +2850,9 @@ components: ["origin", "cache"]
28502850
---
28512851
name: Xrootd.ShutdownTimeout
28522852
description: |+
2853-
The maximum amount of time pelican will wait for the xrootd daemons to gracefully shutdown.
2854-
During this period, the Director will stop redirect new transfer requests to this xrootd server,
2855-
while in-flight transfers are allowed to proceed until timeout.
2853+
The maximum amount of time pelican will wait for the xrootd daemons to shut down gracefully.
2854+
During this period, the Director will stop redirecting new transfer requests to this xrootd server,
2855+
while in-flight transfers are allowed to proceed until timeout.
28562856
type: duration
28572857
default: 1m
28582858
components: ["origin", "cache"]

0 commit comments

Comments
 (0)