-
Notifications
You must be signed in to change notification settings - Fork 283
bridge: add guest-side reconnect loop for live migration #2698
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -189,6 +189,15 @@ type Bridge struct { | |
| hasQuitPending atomic.Bool | ||
|
|
||
| protVer prot.ProtocolVersion | ||
|
|
||
| // Publisher is a stable notification sink that survives bridge recreation | ||
| // during live migration. | ||
| Publisher *Publisher | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why exported? |
||
| } | ||
|
|
||
| // ShutdownRequested returns true if the bridge has been asked to shut down. | ||
| func (b *Bridge) ShutdownRequested() bool { | ||
| return b.hasQuitPending.Load() | ||
| } | ||
|
|
||
| // AssignHandlers creates and assigns the appropriate bridge | ||
|
|
@@ -226,17 +235,20 @@ func (b *Bridge) AssignHandlers(mux *Mux, host *hcsv2.Host) { | |
| func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser) error { | ||
| requestChan := make(chan *Request) | ||
| requestErrChan := make(chan error) | ||
| b.responseChan = make(chan bridgeResponse) | ||
| b.responseChan = make(chan bridgeResponse, 16) | ||
| responseErrChan := make(chan error) | ||
| b.quitChan = make(chan bool) | ||
|
|
||
| defer close(b.quitChan) | ||
| // Close order matters: quitChan must close first so PublishNotification | ||
| // and in-flight handlers see it before responseChan becomes invalid. | ||
| // responseChan is never explicitly closed — the response writer exits | ||
| // when quitChan closes and no more sends are possible. | ||
| defer bridgeIn.Close() | ||
| defer close(requestErrChan) | ||
| defer close(requestChan) | ||
| defer bridgeOut.Close() | ||
| defer close(responseErrChan) | ||
| defer close(b.responseChan) | ||
| defer close(requestChan) | ||
| defer close(requestErrChan) | ||
| defer bridgeIn.Close() | ||
| defer close(b.quitChan) | ||
|
|
||
| // Receive bridge requests and schedule them to be processed. | ||
| go func() { | ||
|
|
@@ -440,7 +452,20 @@ func (b *Bridge) PublishNotification(n *prot.ContainerNotification) { | |
| }, | ||
| response: n, | ||
| } | ||
| b.responseChan <- resp | ||
| // Check quitChan first to avoid sending to a dead bridge. | ||
| select { | ||
| case <-b.quitChan: | ||
| logrus.WithField("containerID", n.ContainerID). | ||
| Warn("bridge quit, dropping notification") | ||
| return | ||
| default: | ||
| } | ||
| select { | ||
| case b.responseChan <- resp: | ||
| case <-b.quitChan: | ||
| logrus.WithField("containerID", n.ContainerID). | ||
| Warn("bridge quit, dropping notification") | ||
| } | ||
| } | ||
|
|
||
| // setErrorForResponseBase modifies the passed-in MessageResponseBase to | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| //go:build linux | ||
|
|
||
| package bridge | ||
|
|
||
| import ( | ||
| "sync" | ||
|
|
||
| "github.com/Microsoft/hcsshim/internal/guest/prot" | ||
| "github.com/sirupsen/logrus" | ||
| ) | ||
|
|
||
| // Publisher provides a stable reference for container exit goroutines | ||
| // to publish notifications through. It survives bridge recreation | ||
| // during live migration — when the bridge is nil, notifications are dropped. | ||
| type Publisher struct { | ||
| mu sync.Mutex | ||
| b *Bridge | ||
| } | ||
|
|
||
| // SetBridge attaches or detaches the current bridge. | ||
| // Pass nil to detach (notifications will be dropped until a new bridge is set). | ||
| func (p *Publisher) SetBridge(b *Bridge) { | ||
| p.mu.Lock() | ||
| defer p.mu.Unlock() | ||
| p.b = b | ||
| } | ||
|
|
||
| // Publish sends a container notification to the current bridge. | ||
| // If no bridge is connected, the notification is dropped with a warning. | ||
| func (p *Publisher) Publish(n *prot.ContainerNotification) { | ||
| p.mu.Lock() | ||
| defer p.mu.Unlock() | ||
| if p.b == nil { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you just use a queue here? On Publish if nil, append, else drain and publish this one. on SetBridge, drain.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we drop exit events the shim will never understand that state I dont think |
||
| logrus.WithField("containerID", n.ContainerID). | ||
| Warn("bridge not connected, dropping container notification") | ||
| return | ||
| } | ||
| p.b.PublishNotification(n) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| //go:build linux | ||
|
|
||
| package bridge | ||
|
|
||
| import ( | ||
| "testing" | ||
|
|
||
| "github.com/Microsoft/hcsshim/internal/guest/prot" | ||
| ) | ||
|
|
||
| func TestPublisher_NilBridge(t *testing.T) { | ||
| p := &Publisher{} | ||
| // Should not panic when bridge is nil | ||
| p.Publish(&prot.ContainerNotification{ | ||
| MessageBase: prot.MessageBase{ContainerID: "test"}, | ||
| }) | ||
| } | ||
|
|
||
| func TestPublisher_SetBridgeNil(t *testing.T) { | ||
| p := &Publisher{} | ||
| p.SetBridge(nil) | ||
| // Should not panic | ||
| p.Publish(&prot.ContainerNotification{ | ||
| MessageBase: prot.MessageBase{ContainerID: "test"}, | ||
| }) | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.