Skip to content

Commit

Permalink
fix(host): 修复 dirty shutdown 机制和 reconcile 的冲突问题 (#21492)
Browse files: browse the repository at this point in the history
  • Loading branch information
zexi authored Oct 29, 2024
1 parent 5f306f3 commit cc0b8ce
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 18 deletions.
2 changes: 1 addition & 1 deletion pkg/hostman/guestman/guestman.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ func NewGuestManager(host hostutils.IHost, serversPath string, workerCnt int) (*
manager.containerRuntimeManager = runtimeMan
manager.pleg = pleg.NewGenericPLEG(runtimeMan, pleg.ChannelCapacity, pleg.RelistPeriod, manager.podCache, clock.RealClock{})
manager.pleg.Start()
manager.startContainerSyncLoop()
}
return manager, nil
}
Expand Down Expand Up @@ -441,6 +440,7 @@ func (m *SGuestManager) OnLoadExistingGuestsComplete() {
if !options.HostOptions.EnableCpuBinding {
m.ClenaupCpuset()
}
m.startContainerSyncLoop()
}

func (m *SGuestManager) verifyDirtyServers() {
Expand Down
32 changes: 15 additions & 17 deletions pkg/hostman/guestman/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,13 +273,11 @@ func (s *sPodGuestInstance) ImportServer(pendingDelete bool) {
s.manager.SaveServer(s.Id, s)
s.manager.RemoveCandidateServer(s)
if s.IsDaemon() || s.IsDirtyShutdown() {
/*ctx := context.Background()
ctx := context.Background()
cred := hostutils.GetComputeSession(ctx).GetToken()
if err := s.StartLocalPod(ctx, cred); err != nil {
if err := s.StartLocalDirtyPod(ctx, cred); err != nil {
log.Errorf("start local pod err %s", err.Error())
}*/
log.Warningf("pod %s need started, waiting sync loop to manage it", s.GetName())
s.SyncStatus(fmt.Sprintf("sync status is_dirty_shutdown: %v, is_daemon: %v", s.IsDirtyShutdown(), s.IsDaemon()))
}
} else {
s.SyncStatus("sync status after host started")
s.getProbeManager().AddPod(s.Desc)
Expand Down Expand Up @@ -688,44 +686,44 @@ func (s *sPodGuestInstance) getCgroupParent() string {
return "/cloudpods"
}

type localPodStartTask struct {
type localDirtyPodStartTask struct {
ctx context.Context
userCred mcclient.TokenCredential
pod *sPodGuestInstance
}

func newLocalPodStartTask(ctx context.Context, userCred mcclient.TokenCredential, pod *sPodGuestInstance) *localPodStartTask {
return &localPodStartTask{
func newLocalDirtyPodStartTask(ctx context.Context, userCred mcclient.TokenCredential, pod *sPodGuestInstance) *localDirtyPodStartTask {
return &localDirtyPodStartTask{
ctx: ctx,
userCred: userCred,
pod: pod,
}
}

func (t *localPodStartTask) Run() {
func (t *localDirtyPodStartTask) Run() {
if t.pod.isPodDirtyShutdown() {
log.Infof("start pod locally (%s/%s)", t.pod.Id, t.pod.GetName())
log.Infof("start dirty pod locally (%s/%s)", t.pod.Id, t.pod.GetName())
if _, err := t.pod.startPod(t.ctx, t.userCred); err != nil {
log.Errorf("start pod(%s/%s) err: %s", t.pod.GetId(), t.pod.GetName(), err.Error())
log.Errorf("start dirty pod(%s/%s) err: %s", t.pod.GetId(), t.pod.GetName(), err.Error())
}
}
for _, ctr := range t.pod.GetContainers() {
if t.pod.isContainerDirtyShutdown(ctr.Id) {
log.Infof("start container locally (%s/%s/%s/%s)", t.pod.Id, t.pod.GetName(), ctr.Id, ctr.Name)
log.Infof("start dirty container locally (%s/%s/%s/%s)", t.pod.Id, t.pod.GetName(), ctr.Id, ctr.Name)
if _, err := t.pod.StartLocalContainer(t.ctx, t.userCred, ctr.Id); err != nil {
log.Errorf("start container %s err: %s", ctr.Id, err.Error())
log.Errorf("start dirty container %s err: %s", ctr.Id, err.Error())
}
}
}
t.pod.SyncStatus("sync status after pod start locally")
t.pod.SyncStatus("sync status after dirty pod start locally")
}

func (t *localPodStartTask) Dump() string {
func (t *localDirtyPodStartTask) Dump() string {
return fmt.Sprintf("pod start task %s/%s", t.pod.GetId(), t.pod.GetName())
}

func (s *sPodGuestInstance) StartLocalPod(ctx context.Context, userCred mcclient.TokenCredential) error {
s.manager.GuestStartWorker.Run(newLocalPodStartTask(ctx, userCred, s), nil, nil)
func (s *sPodGuestInstance) StartLocalDirtyPod(ctx context.Context, userCred mcclient.TokenCredential) error {
s.manager.GuestStartWorker.Run(newLocalDirtyPodStartTask(ctx, userCred, s), nil, nil)
return nil
}

Expand Down
10 changes: 10 additions & 0 deletions pkg/hostman/guestman/pod_sync_loop.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,17 @@ import (
)

func (m *SGuestManager) reconcileContainerLoop(cache runtime.Cache) {
log.Infof("start reconcile container loop")
for {
m.Servers.Range(func(id, obj interface{}) bool {
podObj, ok := obj.(*sPodGuestInstance)
if !ok {
return true
}
if podObj.isPodDirtyShutdown() {
log.Infof("pod %s is dirty shutdown, using dirty shutdown manager to start it", podObj.GetName())
return true
}
if err := m.reconcileContainer(podObj, cache); err != nil {
log.Warningf("reconcile pod %s: %v", podObj.GetId(), err)
}
Expand Down Expand Up @@ -108,6 +113,7 @@ func (m *SGuestManager) startContainer(obj *sPodGuestInstance, ctr *hostapi.Cont
}

func (m *SGuestManager) syncContainerLoop(plegCh chan *pleg.PodLifecycleEvent) {
log.Infof("start sync container loop")
for {
m.syncContainerLoopIteration(plegCh)
}
Expand All @@ -121,6 +127,10 @@ func (m *SGuestManager) syncContainerLoopIteration(plegCh chan *pleg.PodLifecycl
log.Warningf("can not find pod manager by %s", jsonutils.Marshal(e))
return
}
if podMan.(*sPodGuestInstance).isPodDirtyShutdown() {
log.Infof("pod %s is dirty shutdown, waiting it to started", podMan.GetName())
return
}
if e.Type == pleg.ContainerStarted {
log.Infof("pod container started: %s", jsonutils.Marshal(e))
ctrId := e.Data.(string)
Expand Down

0 comments on commit cc0b8ce

Please sign in to comment.