Unverified Commit bb8fc8a4 authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

feat(chrek): external restore, signal-based IPC, and package refactor (#6286)


Co-authored-by: default avatarDan Feigin <dfeigin@nvidia.com>
parent c8423b57
// server.go provides the HTTP server for the checkpoint agent.
package httpApiServer
import (
"context"
"log"
"net/http"
"time"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
)
// ServerConfig holds the configuration for the HTTP API server.
type ServerConfig struct {
ListenAddr string
NodeName string
CheckpointSpec *checkpoint.CheckpointSpec
}
// Server is the HTTP API server for checkpoint operations.
type Server struct {
cfg ServerConfig
handlers *Handlers
httpServer *http.Server
}
// NewServer creates a new Server instance.
func NewServer(cfg ServerConfig, checkpointer *checkpoint.Checkpointer) *Server {
handlers := NewHandlers(cfg, checkpointer)
// Setup routes
mux := http.NewServeMux()
mux.HandleFunc("/health", handlers.HandleHealth)
mux.HandleFunc("/checkpoint", handlers.HandleCheckpoint)
mux.HandleFunc("/checkpoints", handlers.HandleListCheckpoints)
// WriteTimeout must exceed the CRIU checkpoint timeout since /checkpoint
// blocks until the dump completes. Add 60s buffer for pre/post work.
writeTimeout := time.Duration(cfg.CheckpointSpec.CRIU.Timeout)*time.Second + 60*time.Second
if writeTimeout < 300*time.Second {
writeTimeout = 300 * time.Second
}
httpServer := &http.Server{
Addr: cfg.ListenAddr,
Handler: LoggingMiddleware(mux),
ReadTimeout: 30 * time.Second,
WriteTimeout: writeTimeout,
IdleTimeout: 120 * time.Second,
}
return &Server{
cfg: cfg,
handlers: handlers,
httpServer: httpServer,
}
}
// Start starts the HTTP server.
// This method blocks until the server is shut down.
func (s *Server) Start() error {
log.Printf("HTTP API server listening on %s", s.cfg.ListenAddr)
return s.httpServer.ListenAndServe()
}
// Shutdown gracefully shuts down the server.
func (s *Server) Shutdown(ctx context.Context) error {
log.Println("Shutting down HTTP server...")
return s.httpServer.Shutdown(ctx)
}
// Addr returns the server's listen address.
func (s *Server) Addr() string {
return s.cfg.ListenAddr
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment