小记--------spark ——DAGScheduler源码分析

yanqianglifei

2020-02-21

DAGScheduler类位置：org.apache.spark.scheduler

/**
 * Core entry point of DAGScheduler scheduling: handles a single submitted job.
 *
 * Steps, in order:
 *  1. Create the final [[ResultStage]] from the RDD that triggered the job. If creation
 *     throws, a warning is logged, `listener.jobFailed` is called and the method stops.
 *  2. Wrap the final stage in an [[ActiveJob]].
 *  3. Record the job in the scheduler's in-memory maps and post `SparkListenerJobStart`.
 *  4. Call `submitStage(finalStage)` (see section "代码1" below).
 *
 * Why the stage-splitting walk matters (author's note): knowing how an application is cut
 * into jobs and stages, and which of your code ends up in which stage, is what lets you find
 * the code behind a slow or failing stage in production and tune or debug it.
 *
 * Summary of the stage-splitting walk as implemented by `submitStage` and
 * `getMissingParentStages`:
 *  1. It works backwards from the final stage.
 *  2. A shuffle (wide) dependency is where a parent stage is obtained or created.
 *  3. Recursion submits parent stages before the stage that depends on them.
 *
 * @param jobId      id of the job; used as the key into `jobIdToActiveJob` / `jobIdToStageIds`
 * @param finalRDD   RDD handed to `createResultStage` to build the final stage
 * @param func       function handed to `createResultStage`
 * @param partitions partitions handed to `createResultStage`; its length is logged as the
 *                   number of output partitions
 * @param callSite   call site of the job; its `shortForm` appears in the log
 * @param listener   notified through `jobFailed` when the final stage cannot be created;
 *                   also stored in the [[ActiveJob]]
 * @param properties passed to the [[ActiveJob]] and to `SparkListenerJobStart`
 */
private[scheduler] def handleJobSubmitted(
    jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties): Unit = {
  // Step 1: build the final stage from the last RDD of the job.
  // NOTE(review): createResultStage presumably also registers the stage in the scheduler's
  // internal bookkeeping - its body is not shown here, confirm.
  val finalStage: ResultStage =
    try {
      // New stage creation may throw an exception if, for example, jobs are run on a
      // HadoopRDD whose underlying HDFS files have been deleted.
      createResultStage(finalRDD, func, partitions, jobId, callSite)
    } catch {
      case e: Exception =>
        logWarning(s"Creating new stage failed due to exception - job: $jobId", e)
        listener.jobFailed(e)
        return
    }

  // Step 2: create the job; its last stage is the final stage built above.
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo(s"Final stage: $finalStage (${finalStage.name})")
  logInfo(s"Parents of final stage: ${finalStage.parents}")
  logInfo(s"Missing parents: ${getMissingParentStages(finalStage)}")

  val submittedAt = clock.getTimeMillis()

  // Step 3: record the job in the in-memory structures and announce its start.
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  finalStage.setActiveJob(job)
  val infos =
    for {
      id <- jobIdToStageIds(jobId).toArray
      knownStage <- stageIdToStage.get(id)
    } yield knownStage.latestInfo
  listenerBus.post(SparkListenerJobStart(job.jobId, submittedAt, infos, properties))

  // Step 4: submit the final stage. submitStage() first recurses into missing parent stages,
  // so stages without missing parents get their tasks submitted while every stage that still
  // has missing parents is added to waitingStages. See section "代码1" below.
  submitStage(finalStage)
}

代码1

/**
 * Submits stage, but first recursively submits any missing parents.
 *
 * This method is the entry point of the stage-splitting walk; the walk itself is carried
 * out jointly by this method and `getMissingParentStages()`.
 *
 * Behaviour:
 *  - If no active job is found for `stage`, the stage is aborted.
 *  - If the stage is already in `waitingStages`, `runningStages` or `failedStages`,
 *    nothing else happens.
 *  - If the stage has no missing parents, its tasks are submitted through
 *    `submitMissingTasks` (see section "代码4").
 *  - Otherwise every missing parent is submitted recursively, in ascending stage-id order,
 *    and `stage` itself is added to `waitingStages`.
 *
 * @param stage the stage to submit
 */
private def submitStage(stage: Stage): Unit = {
  activeJobForStage(stage) match {
    case Some(activeJobId) =>
      logDebug(s"submitStage($stage)")
      val alreadyTracked =
        waitingStages(stage) || runningStages(stage) || failedStages(stage)
      if (!alreadyTracked) {
        // Ask for the parent stages that still need to run (see section "代码2").
        val missingParents = getMissingParentStages(stage).sortBy(_.id)
        logDebug(s"missing: $missingParents")

        if (missingParents.nonEmpty) {
          // Recurse into the parents first. The recursion bottoms out at stages that have
          // no missing parents; those are the ones whose tasks are actually submitted.
          missingParents.foreach(submitStage)
          // This stage has to wait for its parents, so park it in waitingStages.
          waitingStages += stage
        } else {
          logInfo(s"Submitting $stage (${stage.rdd}), which has no missing parents")
          submitMissingTasks(stage, activeJobId)
        }
      }

    case None =>
      abortStage(stage, s"No active job for stage ${stage.id}", None)
  }
}

代码2

/**
 * Collects the parent stages of `stage` that are not yet available.
 *
 * Starting at `stage.rdd`, the RDD graph is walked through narrow dependencies only. For
 * every shuffle (wide) dependency met on the way, the corresponding map stage is fetched via
 * `getOrCreateShuffleMapStage`; it is added to the result when `isAvailable` is false, and
 * the walk does not continue past that dependency. RDDs for which `getCacheLocs` contains no
 * `Nil` entry are not expanded further. If only narrow dependencies are met, the result is
 * empty.
 *
 * @param stage the stage whose missing parents are wanted
 * @return the missing parent stages, in the iteration order of the underlying hash set
 */
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  // We are manually maintaining a stack here to prevent StackOverflowError
  // caused by recursively visiting
  val waitingForVisit = new Stack[RDD[_]]

  // Expands one RDD (annotated copy in section "代码3").
  def visit(rdd: RDD[_]): Unit = {
    // `add` returns true only the first time an RDD is seen, so the cache lookup runs at
    // most once per RDD.
    if (visited.add(rdd) && getCacheLocs(rdd).contains(Nil)) {
      rdd.dependencies.foreach {
        case shuffleDep: ShuffleDependency[_, _, _] =>
          val parentStage = getOrCreateShuffleMapStage(shuffleDep, stage.firstJobId)
          if (!parentStage.isAvailable) {
            missing += parentStage
          }
        case narrow: NarrowDependency[_] =>
          waitingForVisit.push(narrow.rdd)
      }
    }
  }

  // Seed the stack with the stage's own RDD, then drain it; visit() may push more RDDs.
  waitingForVisit.push(stage.rdd)
  while (waitingForVisit.nonEmpty) {
    visit(waitingForVisit.pop())
  }
  missing.toList
}

代码3

// Excerpt of the `visit` helper nested inside getMissingParentStages(): `visited`,
// `missing`, `waitingForVisit` and `stage` are variables of that enclosing method.
def visit(rdd: RDD[_]): Unit = {
  // Handle each RDD once, and only expand it when getCacheLocs reports at least one
  // partition with an empty location list.
  if (visited.add(rdd) && getCacheLocs(rdd).contains(Nil)) {
    // Walk the dependencies of this RDD.
    // Author's note (not verifiable from this excerpt): shuffle operators such as
    // groupByKey, reduceByKey and countByKey are said to be backed by three RDDs:
    // MapPartitionsRDD, shuffleRDD, MapPartitionsRDD.
    rdd.dependencies.foreach {
      // Wide dependency: obtain the map stage for this shuffle and remember it when it is
      // not available yet.
      // NOTE(review): the original note says the created stage is flagged as a shuffle-map
      // stage and that every stage before the final one is a shuffle-map stage - this is not
      // visible here, confirm against getOrCreateShuffleMapStage.
      case shuffleDep: ShuffleDependency[_, _, _] =>
        val parentStage = getOrCreateShuffleMapStage(shuffleDep, stage.firstJobId)
        if (!parentStage.isAvailable) {
          missing += parentStage
        }

      // Narrow dependency: push the parent RDD onto the stack so it is visited later.
      case narrow: NarrowDependency[_] =>
        waitingForVisit.push(narrow.rdd)
    }
  }
}

代码4

/**
 * Submits a stage: creates one task for each partition returned by
 * `stage.findMissingPartitions()` and passes them to the task scheduler as one `TaskSet`.
 *
 * Outline:
 *  1. Clear the stage's pending partitions and determine the partitions to compute.
 *  2. Add the stage to `runningStages` and notify `outputCommitCoordinator`.
 *  3. Compute the preferred locations of every task through `getPreferredLocs`.
 *  4. Start a new stage attempt and post `SparkListenerStageSubmitted`.
 *  5. Serialize and broadcast the task binary.
 *  6. Build `ShuffleMapTask`s or `ResultTask`s, depending on the stage type.
 *  7. Submit the tasks, or, when there are none, mark the stage as finished and submit
 *     its waiting child stages.
 *
 * If step 3, 5 or 6 fails, the stage is aborted, removed from `runningStages`, and the
 * method returns early.
 *
 * @param stage the stage whose missing tasks are submitted
 * @param jobId id of the job; used to look up the scheduling properties in
 *              `jobIdToActiveJob` and passed on to the created tasks and `TaskSet`
 */
private def submitMissingTasks(stage: Stage, jobId: Int): Unit = {
  logDebug(s"submitMissingTasks($stage)")
  // Get our pending tasks and remember them in our pendingTasks entry
  stage.pendingPartitions.clear()

  // First figure out the indexes of partition ids to compute.
  // One task is created below for each of these partitions.
  val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()

  // Use the scheduling pool, job group, description, etc. from an ActiveJob associated
  // with this Stage
  val properties = jobIdToActiveJob(jobId).properties

  // The stage now counts as running.
  runningStages += stage
  // SparkListenerStageSubmitted should be posted before testing whether tasks are
  // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
  // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
  // event.
  stage match {
    case mapStage: ShuffleMapStage =>
      outputCommitCoordinator.stageStart(
        stage = mapStage.id, maxPartitionId = mapStage.numPartitions - 1)
    case resultStage: ResultStage =>
      outputCommitCoordinator.stageStart(
        stage = resultStage.id, maxPartitionId = resultStage.rdd.partitions.length - 1)
  }

  // Preferred locations per task id, computed through getPreferredLocs.
  val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
    stage match {
      case _: ShuffleMapStage =>
        partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id)) }.toMap
      case resultStage: ResultStage =>
        partitionsToCompute.map { id =>
          val partitionIndex = resultStage.partitions(id)
          (id, getPreferredLocs(stage.rdd, partitionIndex))
        }.toMap
    }
  } catch {
    case NonFatal(e) =>
      // A stage attempt is still created and announced before aborting.
      stage.makeNewStageAttempt(partitionsToCompute.size)
      listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
      abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
      runningStages -= stage
      return
  }

  stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
  // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
  // the serialized copy of the RDD and for each task we will deserialize it, which means each
  // task gets a different copy of the RDD. This provides stronger isolation between tasks that
  // might modify state of objects referenced in their closures. This is necessary in Hadoop
  // where the JobConf/Configuration object is not thread-safe.
  val taskBinary: Broadcast[Array[Byte]] = try {
    // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
    // For ResultTask, serialize and broadcast (rdd, func).
    val taskBinaryBytes: Array[Byte] = stage match {
      case mapStage: ShuffleMapStage =>
        JavaUtils.bufferToArray(
          closureSerializer.serialize((mapStage.rdd, mapStage.shuffleDep): AnyRef))
      case resultStage: ResultStage =>
        JavaUtils.bufferToArray(
          closureSerializer.serialize((resultStage.rdd, resultStage.func): AnyRef))
    }
    sc.broadcast(taskBinaryBytes)
  } catch {
    // In the case of a failure during serialization, abort the stage.
    case e: NotSerializableException =>
      abortStage(stage, s"Task not serializable: ${e.toString}", Some(e))
      runningStages -= stage
      // Abort execution
      return
    case NonFatal(e) =>
      abortStage(stage, s"Task serialization failed: $e\n${Utils.exceptionString(e)}", Some(e))
      runningStages -= stage
      return
  }

  // Create one task per partition to compute, each with the preferred locations computed
  // above.
  val tasks: Seq[Task[_]] = try {
    stage match {
      // A shuffle map stage gets ShuffleMapTasks.
      case mapStage: ShuffleMapStage =>
        partitionsToCompute.map { id =>
          val locs = taskIdToLocations(id)
          val part = mapStage.rdd.partitions(id)
          new ShuffleMapTask(mapStage.id, mapStage.latestInfo.attemptId,
            taskBinary, part, locs, mapStage.latestInfo.taskMetrics, properties, Option(jobId),
            Option(sc.applicationId), sc.applicationAttemptId)
        }

      // A result stage gets ResultTasks.
      case resultStage: ResultStage =>
        partitionsToCompute.map { id =>
          val partitionIndex: Int = resultStage.partitions(id)
          val part = resultStage.rdd.partitions(partitionIndex)
          val locs = taskIdToLocations(id)
          new ResultTask(resultStage.id, resultStage.latestInfo.attemptId,
            taskBinary, part, locs, id, properties, resultStage.latestInfo.taskMetrics,
            Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
        }
    }
  } catch {
    case NonFatal(e) =>
      abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
      runningStages -= stage
      return
  }

  if (tasks.isEmpty) {
    // Because we posted SparkListenerStageSubmitted earlier, we should mark
    // the stage as completed here in case there are no tasks to run
    markStageAsFinished(stage, None)

    val debugString = stage match {
      case mapStage: ShuffleMapStage =>
        s"Stage ${mapStage} is actually done; " +
          s"(available: ${mapStage.isAvailable}," +
          s"available outputs: ${mapStage.numAvailableOutputs}," +
          s"partitions: ${mapStage.numPartitions})"
      case resultStage: ResultStage =>
        s"Stage ${resultStage} is actually done; (partitions: ${resultStage.numPartitions})"
    }
    logDebug(debugString)

    submitWaitingChildStages(stage)
  } else {
    logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd})")
    stage.pendingPartitions ++= tasks.map(_.partitionId)
    logDebug(s"New pending partitions: ${stage.pendingPartitions}")
    taskScheduler.submitTasks(new TaskSet(
      tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
    stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
  }
}

stage spark spark源码分析