Kube Controller Manager 源码分析（下）

JavaSoldier

2019-01-23

Kube Controller Manager 源码分析（上）

http://toutiao.com/item/6649523482604339726/

Deployment Controller Run 函数

资源初始化完毕后，就开始真正的Run 来看一下Run 函数

func (dc *DeploymentController) Run(workers int, stopCh &lt;-chan struct{}) {
 defer utilruntime.HandleCrash()
 defer dc.queue.ShutDown()
 klog.Infof("Starting deployment controller")
 defer klog.Infof("Shutting down deployment controller")
 if !controller.WaitForCacheSync("deployment", stopCh, dc.dListerSynced, dc.rsListerSynced, dc.podListerSynced) {
 return
 }
 for i := 0; i &lt; workers; i++ {
 go wait.Until(dc.worker, time.Second, stopCh)
 }
 &lt;-stopCh
}
// worker keeps processing items from the work queue, one at a time, and
// returns as soon as processNextWorkItem reports that it should stop.
func (dc *DeploymentController) worker() {
	for {
		if !dc.processNextWorkItem() {
			return
		}
	}
}
// processNextWorkItem takes a single key off the work queue, runs the sync
// handler on it and passes the outcome to handleErr. It returns false only
// when the queue reports that it is quitting; otherwise it returns true so
// the caller keeps looping.
func (dc *DeploymentController) processNextWorkItem() bool {
	item, shutdown := dc.queue.Get()
	if shutdown {
		return false
	}
	// Always mark the item as done, whatever the sync outcome is.
	defer dc.queue.Done(item)

	syncErr := dc.syncHandler(item.(string))
	dc.handleErr(syncErr, item)
	return true
}

可以看到这个代码就是Client-go 里面标准版的Worker 消费者，不断的从Queue 里面拿Obj 然后调用syncHandler 处理，一起来看看最终的Handler如何处理

dc.syncHandler

func (dc *DeploymentController) syncDeployment(key string) error {
 startTime := time.Now()
 klog.V(4).Infof("Started syncing deployment %q (%v)", key, startTime)
 defer func() {
 klog.V(4).Infof("Finished syncing deployment %q (%v)", key, time.Since(startTime))
 }()
 namespace, name, err := cache.SplitMetaNamespaceKey(key)
 if err != nil {
 return err
 }
 deployment, err := dc.dLister.Deployments(namespace).Get(name)
 if errors.IsNotFound(err) {
 klog.V(2).Infof("Deployment %v has been deleted", key)
 return nil
 }
 if err != nil {
 return err
 }
 // Deep-copy otherwise we are mutating our cache.
 // TODO: Deep-copy only when needed.
 d := deployment.DeepCopy()
 everything := metav1.LabelSelector{}
 if reflect.DeepEqual(d.Spec.Selector, &amp;everything) {
 dc.eventRecorder.Eventf(d, v1.EventTypeWarning, "SelectingAll", "This deployment is selecting all pods. A non-empty selector is required.")
 if d.Status.ObservedGeneration &lt; d.Generation {
 d.Status.ObservedGeneration = d.Generation
 dc.client.AppsV1().Deployments(d.Namespace).UpdateStatus(d)
 }
 return nil
 }
 // List ReplicaSets owned by this Deployment, while reconciling ControllerRef
 // through adoption/orphaning.
 rsList, err := dc.getReplicaSetsForDeployment(d)
 if err != nil {
 return err
 }
 // List all Pods owned by this Deployment, grouped by their ReplicaSet.
 // Current uses of the podMap are:
 //
 // * check if a Pod is labeled correctly with the pod-template-hash label.
 // * check that no old Pods are running in the middle of Recreate Deployments.
 podMap, err := dc.getPodMapForDeployment(d, rsList)
 if err != nil {
 return err
 }
 if d.DeletionTimestamp != nil {
 return dc.syncStatusOnly(d, rsList)
 }
 // Update deployment conditions with an Unknown condition when pausing/resuming
 // a deployment. In this way, we can be sure that we won't timeout when a user
 // resumes a Deployment with a set progressDeadlineSeconds.
 if err = dc.checkPausedConditions(d); err != nil {
 return err
 }
 if d.Spec.Paused {
 return dc.sync(d, rsList)
 }
 // rollback is not re-entrant in case the underlying replica sets are updated with a new
 // revision so we should ensure that we won't proceed to update replica sets until we
 // make sure that the deployment has cleaned up its rollback spec in subsequent enqueues.
 if getRollbackTo(d) != nil {
 return dc.rollback(d, rsList)
 }
 scalingEvent, err := dc.isScalingEvent(d, rsList)
 if err != nil {
 return err
 }
 if scalingEvent {
 return dc.sync(d, rsList)
 }
 switch d.Spec.Strategy.Type {
 case apps.RecreateDeploymentStrategyType:
 return dc.rolloutRecreate(d, rsList, podMap)
 case apps.RollingUpdateDeploymentStrategyType:
 return dc.rolloutRolling(d, rsList)
 }
 return fmt.Errorf("unexpected deployment strategy type: %s", d.Spec.Strategy.Type)
}

根据Worker Queue 取出来的Namespace & Name 从Lister 内Query到真正的Deployment 对象
根据Deployment label 查询对应的ReplicaSet 列表
根据ReplicaSet label 查询对应的 Pod 列表，并生成一个 key 为 ReplicaSet UID、value 为 PodList 的 Map 数据结构
判断当前Deployment 是否处于暂停状态
判断当前Deployment 是否处于回滚状态
根据更新策略Recreate 还是 RollingUpdate 决定对应的动作

这里我们以Recreate为例来看一下策略动作

func (dc *DeploymentController) rolloutRecreate(d *apps.Deployment, rsList []*apps.ReplicaSet, podMap map[types.UID]*v1.PodList) error {
 // Don't create a new RS if not already existed, so that we avoid scaling up before scaling down.
 newRS, oldRSs, err := dc.getAllReplicaSetsAndSyncRevision(d, rsList, false)
 if err != nil {
 return err
 }
 allRSs := append(oldRSs, newRS)
 activeOldRSs := controller.FilterActiveReplicaSets(oldRSs)
 // scale down old replica sets.
 scaledDown, err := dc.scaleDownOldReplicaSetsForRecreate(activeOldRSs, d)
 if err != nil {
 return err
 }
 if scaledDown {
 // Update DeploymentStatus.
 return dc.syncRolloutStatus(allRSs, newRS, d)
 }
 // Do not process a deployment when it has old pods running.
 if oldPodsRunning(newRS, oldRSs, podMap) {
 return dc.syncRolloutStatus(allRSs, newRS, d)
 }
 // If we need to create a new RS, create it now.
 if newRS == nil {
 newRS, oldRSs, err = dc.getAllReplicaSetsAndSyncRevision(d, rsList, true)
 if err != nil {
 return err
 }
 allRSs = append(oldRSs, newRS)
 }
 // scale up new replica set.
 if _, err := dc.scaleUpNewReplicaSetForRecreate(newRS, d); err != nil {
 return err
 }
 if util.DeploymentComplete(d, &amp;d.Status) {
 if err := dc.cleanupDeployment(oldRSs, d); err != nil {
 return err
 }
 }
 // Sync deployment status.
 return dc.syncRolloutStatus(allRSs, newRS, d)
}

根据ReplicaSet 获取当前所有的新老ReplicaSet
如果有老的ReplicaSet 那么先把老的ReplicaSet replicas 缩容设置为0，当然第一次创建的时候是没有老ReplicaSet的
如果第一次创建，那么需要去创建对应的ReplicaSet
创建完毕对应的ReplicaSet后扩容ReplicaSet 到对应的值
等待新建的ReplicaSet 创建完毕后，清理老的ReplicaSet
更新Deployment Status

下面我们看看第一次创建Deployment 的代码

func (dc *DeploymentController) getNewReplicaSet(d *apps.Deployment, rsList, oldRSs []*apps.ReplicaSet, createIfNotExisted bool) (*apps.ReplicaSet, error) {
 existingNewRS := deploymentutil.FindNewReplicaSet(d, rsList)
 // Calculate the max revision number among all old RSes
 maxOldRevision := deploymentutil.MaxRevision(oldRSs)
 // Calculate revision number for this new replica set
 newRevision := strconv.FormatInt(maxOldRevision+1, 10)
 // Latest replica set exists. We need to sync its annotations (includes copying all but
 // annotationsToSkip from the parent deployment, and update revision, desiredReplicas,
 // and maxReplicas) and also update the revision annotation in the deployment with the
 // latest revision.
 if existingNewRS != nil {
 rsCopy := existingNewRS.DeepCopy()
 // Set existing new replica set's annotation
 annotationsUpdated := deploymentutil.SetNewReplicaSetAnnotations(d, rsCopy, newRevision, true)
 minReadySecondsNeedsUpdate := rsCopy.Spec.MinReadySeconds != d.Spec.MinReadySeconds
 if annotationsUpdated || minReadySecondsNeedsUpdate {
 rsCopy.Spec.MinReadySeconds = d.Spec.MinReadySeconds
 return dc.client.AppsV1().ReplicaSets(rsCopy.ObjectMeta.Namespace).Update(rsCopy)
 }
 // Should use the revision in existingNewRS's annotation, since it set by before
 needsUpdate := deploymentutil.SetDeploymentRevision(d, rsCopy.Annotations[deploymentutil.RevisionAnnotation])
 // If no other Progressing condition has been recorded and we need to estimate the progress
 // of this deployment then it is likely that old users started caring about progress. In that
 // case we need to take into account the first time we noticed their new replica set.
 cond := deploymentutil.GetDeploymentCondition(d.Status, apps.DeploymentProgressing)
 if deploymentutil.HasProgressDeadline(d) &amp;&amp; cond == nil {
 msg := fmt.Sprintf("Found new replica set %q", rsCopy.Name)
 condition := deploymentutil.NewDeploymentCondition(apps.DeploymentProgressing, v1.ConditionTrue, deploymentutil.FoundNewRSReason, msg)
 deploymentutil.SetDeploymentCondition(&amp;d.Status, *condition)
 needsUpdate = true
 }
 if needsUpdate {
 var err error
 if d, err = dc.client.AppsV1().Deployments(d.Namespace).UpdateStatus(d); err != nil {
 return nil, err
 }
 }
 return rsCopy, nil
 }
 if !createIfNotExisted {
 return nil, nil
 }
 // new ReplicaSet does not exist, create one.
 newRSTemplate := *d.Spec.Template.DeepCopy()
 podTemplateSpecHash := controller.ComputeHash(&amp;newRSTemplate, d.Status.CollisionCount)
 newRSTemplate.Labels = labelsutil.CloneAndAddLabel(d.Spec.Template.Labels, apps.DefaultDeploymentUniqueLabelKey, podTemplateSpecHash)
 // Add podTemplateHash label to selector.
 newRSSelector := labelsutil.CloneSelectorAndAddLabel(d.Spec.Selector, apps.DefaultDeploymentUniqueLabelKey, podTemplateSpecHash)
 // Create new ReplicaSet
 newRS := apps.ReplicaSet{
 ObjectMeta: metav1.ObjectMeta{
 // Make the name deterministic, to ensure idempotence
 Name: d.Name + "-" + podTemplateSpecHash,
 Namespace: d.Namespace,
 OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(d, controllerKind)},
 Labels: newRSTemplate.Labels,
 },
 Spec: apps.ReplicaSetSpec{
 Replicas: new(int32),
 MinReadySeconds: d.Spec.MinReadySeconds,
 Selector: newRSSelector,
 Template: newRSTemplate,
 },
 }
 allRSs := append(oldRSs, &amp;newRS)
 newReplicasCount, err := deploymentutil.NewRSNewReplicas(d, allRSs, &amp;newRS)
 if err != nil {
 return nil, err
 }
 *(newRS.Spec.Replicas) = newReplicasCount
 // Set new replica set's annotation
 deploymentutil.SetNewReplicaSetAnnotations(d, &amp;newRS, newRevision, false)
 // Create the new ReplicaSet. If it already exists, then we need to check for possible
 // hash collisions. If there is any other error, we need to report it in the status of
 // the Deployment.
 alreadyExists := false
 createdRS, err := dc.client.AppsV1().ReplicaSets(d.Namespace).Create(&amp;newRS)

这里截取了部分重要代码

首先查询一下当前是否有对应的新的ReplicaSet
如果有那么仅仅需要更新Deployment Status 即可
如果没有那么创建对应的ReplicaSet 结构体
最后调用Client-go 创建对应的ReplicaSet 实例

后面还有一些代码这里就不贴了，核心思想就是，根据ReplicaSet的情况创建对应的新的ReplicaSet，其实看到使用Client-go 创建ReplicaSet Deployment 这里基本完成了使命，剩下的就是根据watch 改变一下Deployment 的状态了，至于真正的Pod 的创建，那么就得ReplicaSet Controller 来完成了。

ReplicaSet Controller

ReplicaSet Controller 和Deployment Controller 长得差不多，重复的部分我们就不多说，先看一下初始化的时候，ReplicaSet 主要关注哪些资源

func NewBaseController(rsInformer appsinformers.ReplicaSetInformer, podInformer coreinformers.PodInformer, kubeClient clientset.Interface, burstReplicas int,
 gvk schema.GroupVersionKind, metricOwnerName, queueName string, podControl controller.PodControlInterface) *ReplicaSetController {
 if kubeClient != nil &amp;&amp; kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil {
 metrics.RegisterMetricAndTrackRateLimiterUsage(metricOwnerName, kubeClient.CoreV1().RESTClient().GetRateLimiter())
 }
 rsc := &amp;ReplicaSetController{
 GroupVersionKind: gvk,
 kubeClient: kubeClient,
 podControl: podControl,
 burstReplicas: burstReplicas,
 expectations: controller.NewUIDTrackingControllerExpectations(controller.NewControllerExpectations()),
 queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), queueName),
 }
 rsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 AddFunc: rsc.enqueueReplicaSet,
 UpdateFunc: rsc.updateRS,
 // This will enter the sync loop and no-op, because the replica set has been deleted from the store.
 // Note that deleting a replica set immediately after scaling it to 0 will not work. The recommended
 // way of achieving this is by performing a `stop` operation on the replica set.
 DeleteFunc: rsc.enqueueReplicaSet,
 })
 rsc.rsLister = rsInformer.Lister()
 rsc.rsListerSynced = rsInformer.Informer().HasSynced
 podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 AddFunc: rsc.addPod,
 // This invokes the ReplicaSet for every pod change, eg: host assignment. Though this might seem like
 // overkill the most frequent pod update is status, and the associated ReplicaSet will only list from
 // local storage, so it should be ok.
 UpdateFunc: rsc.updatePod,
 DeleteFunc: rsc.deletePod,
 })
 rsc.podLister = podInformer.Lister()
 rsc.podListerSynced = podInformer.Informer().HasSynced
 rsc.syncHandler = rsc.syncReplicaSet
 return rsc
}

可以看到ReplicaSet Controller 主要关注 ReplicaSet 和 Pod 这两类资源的新增、更新和删除事件，它们的处理逻辑是一样的，都是根据触发函数，找到对应的ReplicaSet实例后，将对应的ReplicaSet 实例放到Worker Queue里面去。

syncReplicaSet

这里我们直接来看ReplicaSet Controller 的真正处理函数

func (rsc *ReplicaSetController) syncReplicaSet(key string) error {
 startTime := time.Now()
 defer func() {
 klog.V(4).Infof("Finished syncing %v %q (%v)", rsc.Kind, key, time.Since(startTime))
 }()
 namespace, name, err := cache.SplitMetaNamespaceKey(key)
 if err != nil {
 return err
 }
 rs, err := rsc.rsLister.ReplicaSets(namespace).Get(name)
 if errors.IsNotFound(err) {
 klog.V(4).Infof("%v %v has been deleted", rsc.Kind, key)
 rsc.expectations.DeleteExpectations(key)
 return nil
 }
 if err != nil {
 return err
 }
 rsNeedsSync := rsc.expectations.SatisfiedExpectations(key)
 selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
 if err != nil {
 utilruntime.HandleError(fmt.Errorf("Error converting pod selector to selector: %v", err))
 return nil
 }
 // list all pods to include the pods that don't match the rs`s selector
 // anymore but has the stale controller ref.
 // TODO: Do the List and Filter in a single pass, or use an index.
 allPods, err := rsc.podLister.Pods(rs.Namespace).List(labels.Everything())
 if err != nil {
 return err
 }
 // Ignore inactive pods.
 var filteredPods []*v1.Pod
 for _, pod := range allPods {
 if controller.IsPodActive(pod) {
 filteredPods = append(filteredPods, pod)
 }
 }
 // NOTE: filteredPods are pointing to objects from cache - if you need to
 // modify them, you need to copy it first.
 filteredPods, err = rsc.claimPods(rs, selector, filteredPods)
 if err != nil {
 return err
 }
 var manageReplicasErr error
 if rsNeedsSync &amp;&amp; rs.DeletionTimestamp == nil {
 manageReplicasErr = rsc.manageReplicas(filteredPods, rs)
 }
 rs = rs.DeepCopy()
 newStatus := calculateStatus(rs, filteredPods, manageReplicasErr)

根据从Worker Queue 得到的Name 获取真正的ReplicaSet 实例
列出ReplicaSet 所在Namespace 下的所有Pod（包括已不再匹配Selector 但仍带有旧ControllerRef 的Pod）
过滤出其中处于活跃（Active）状态的Pod，并通过claimPods 认领属于该ReplicaSet 的Pod
根据Pod 情况判断是否需要创建 Pod
将新的状态更新到ReplicaSet Status 字段中

manageReplicas

我们主要来看一眼创建Pod 的函数

func (rsc *ReplicaSetController) manageReplicas(filteredPods []*v1.Pod, rs *apps.ReplicaSet) error {
 diff := len(filteredPods) - int(*(rs.Spec.Replicas))
 rsKey, err := controller.KeyFunc(rs)
 if err != nil {
 utilruntime.HandleError(fmt.Errorf("Couldn't get key for %v %#v: %v", rsc.Kind, rs, err))
 return nil
 }
 if diff &lt; 0 {
 diff *= -1
 if diff &gt; rsc.burstReplicas {
 diff = rsc.burstReplicas
 }
 // TODO: Track UIDs of creates just like deletes. The problem currently
 // is we'd need to wait on the result of a create to record the pod's
 // UID, which would require locking *across* the create, which will turn
 // into a performance bottleneck. We should generate a UID for the pod
 // beforehand and store it via ExpectCreations.
 rsc.expectations.ExpectCreations(rsKey, diff)
 klog.V(2).Infof("Too few replicas for %v %s/%s, need %d, creating %d", rsc.Kind, rs.Namespace, rs.Name, *(rs.Spec.Replicas), diff)
 // Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize
 // and double with each successful iteration in a kind of "slow start".
 // This handles attempts to start large numbers of pods that would
 // likely all fail with the same error. For example a project with a
 // low quota that attempts to create a large number of pods will be
 // prevented from spamming the API service with the pod create requests
 // after one of its pods fails. Conveniently, this also prevents the
 // event spam that those failures would generate.
 successfulCreations, err := slowStartBatch(diff, controller.SlowStartInitialBatchSize, func() error {
 boolPtr := func(b bool) *bool { return &amp;b }
 controllerRef := &amp;metav1.OwnerReference{
 APIVersion: rsc.GroupVersion().String(),
 Kind: rsc.Kind,
 Name: rs.Name,
 UID: rs.UID,
 BlockOwnerDeletion: boolPtr(true),
 Controller: boolPtr(true),
 }
 err := rsc.podControl.CreatePodsWithControllerRef(rs.Namespace, &amp;rs.Spec.Template, rs, controllerRef)
 if err != nil &amp;&amp; errors.IsTimeout(err) {
 // Pod is created but its initialization has timed out.
 // If the initialization is successful eventually, the
 // controller will observe the creation via the informer.
 // If the initialization fails, or if the pod keeps
 // uninitialized for a long time, the informer will not
 // receive any update, and the controller will create a new
 // pod when the expectation expires.
 return nil
 }
 return err
 })
 // Any skipped pods that we never attempted to start shouldn't be expected.
 // The skipped pods will be retried later. The next controller resync will
 // retry the slow start process.
 if skippedPods := diff - successfulCreations; skippedPods &gt; 0 {
 klog.V(2).Infof("Slow-start failure. Skipping creation of %d pods, decrementing expectations for %v %v/%v", skippedPods, rsc.Kind, rs.Namespace, rs.Name)
 for i := 0; i &lt; skippedPods; i++ {
 // Decrement the expected number of creates because the informer won't observe this pod
 rsc.expectations.CreationObserved(rsKey)
 }
 }
 return err
 } else if diff &gt; 0 {
 if diff &gt; rsc.burstReplicas {
 diff = rsc.burstReplicas
 }
 klog.V(2).Infof("Too many replicas for %v %s/%s, need %d, deleting %d", rsc.Kind, rs.Namespace, rs.Name, *(rs.Spec.Replicas), diff)
 // Choose which Pods to delete, preferring those in earlier phases of startup.
 podsToDelete := getPodsToDelete(filteredPods, diff)
 // Snapshot the UIDs (ns/name) of the pods we're expecting to see
 // deleted, so we know to record their expectations exactly once either
 // when we see it as an update of the deletion timestamp, or as a delete.
 // Note that if the labels on a pod/rs change in a way that the pod gets
 // orphaned, the rs will only wake up after the expectations have
 // expired even if other pods are deleted.
 rsc.expectations.ExpectDeletions(rsKey, getPodKeys(podsToDelete))
 errCh := make(chan error, diff)
 var wg sync.WaitGroup
 wg.Add(diff)
 for _, pod := range podsToDelete {
 go func(targetPod *v1.Pod) {
 defer wg.Done()
 if err := rsc.podControl.DeletePod(rs.Namespace, targetPod.Name, rs); err != nil {
 // Decrement the expected number of deletes because the informer won't observe this deletion
 podKey := controller.PodKey(targetPod)
 klog.V(2).Infof("Failed to delete %v, decrementing expectations for %v %s/%s", podKey, rsc.Kind, rs.Namespace, rs.Name)
 rsc.expectations.DeletionObserved(rsKey, podKey)
 errCh &lt;- err
 }
 }(pod)
 }
 wg.Wait()

这里的逻辑非常简单，基本上就是将当前活跃的 Pod 数量和 replicas 声明的期望值进行比对，如果少了就调用Client-go 创建Pod，如果多了就调用Client-go 删除Pod。

总结

至此，一个Deployment -> ReplicaSet -> Pod 就真正的创建完毕。当Pod 被删除时候，ReplicaSet Controller 就会把 Pod 拉起来。如果更新Deployment 就会创建新的ReplicaSet 一层层嵌套多个Controller 结合完成最终的 Pod 创建。当然，这里其实仅仅完成了Pod 数据写入到ETCD，其实真正的 Pod 实例并没有创建，还需要scheduler & kubelet 配合完成，我们会在后面的章节继续介绍。

作者：xianlubird

源码 return key dc

安科网

Kube Controller Manager 源码分析（下）

JavaSoldier

ReplicaSet Controller

总结

JavaSoldier

相关推荐

Java+Linux，深入内核源码讲解多线程之进程

java Activiti工作流引擎 websocket即时聊天发图片文字好友群组 SSM源码

为Linux的cp和mv命令添加进度条

彻底搞懂Node.js中的Require机制

Vue源码中值得学习的方法

SpringBoot外化配置源码解析：综合实战演示参数及配置

从Linux源码看Socket(TCP)的Listen及连接队列

Redux源码解析系列 (二)-- 牛鼻的createStore

Spring源码之Bean实例化基本原理

从Linux源码看Socket(TCP)的Bind

Vue进阶面试必问，异步更新机制和nextTick原理

通过源码理解Rarp协议（基于linux1.2.13）

重新认识Typescript | Vue3源码系列

Linux下Python3.6的安装及避坑指南

源码分析C++的string的实现

干货ReentrantLock非公平锁源码分析

学会反射后，我被录取了！

搭建一对一直播平台，选择直播系统源码，这几点不容忽视

关于Redis网络模型的源码详析

koa中间件核心（koa-compose）源码解读分析

JavaSoldier