Submit stage 07/24 Update SLTechnology News&Howtos

Submit stage

2026-07-24 Update From: SLTechnology News&Howtos shulou NAV: SLTechnology News&Howtos > Internet Technology >

Shulou(Shulou.com)06/03 Report--

// Submit the stage: create a batch of tasks for it, one task per partition that has to be computed.

/**
 * Submits the tasks of `stage` that still need to run.
 *
 * One task is created for every partition that has not been computed yet: a `ShuffleMapTask`
 * per partition when `stage.isShuffleMap` is true, otherwise a `ResultTask` per unfinished
 * partition of the job the stage produces the result for. The stage is registered in
 * `runningStages`, a `SparkListenerStageSubmitted` event is posted, the task closure is
 * serialized and broadcast, and the resulting task set is handed to `taskScheduler`.
 *
 * If serialization fails the stage is aborted and removed from `runningStages`. If there is
 * nothing left to compute, a `SparkListenerStageCompleted` event is posted immediately.
 *
 * @param stage the stage whose missing tasks should be submitted
 * @param jobId id of the job on whose behalf the stage is submitted; used to look up the
 *              scheduling properties of the active job
 */
private def submitMissingTasks(stage: Stage, jobId: Int): Unit = {
  logDebug("submitMissingTasks(" + stage + ")")

  // Get our pending tasks and remember them in our pendingTasks entry
  stage.pendingTasks.clear()

  // First figure out the indexes of partition ids to compute.
  // One task will be created for each id in this sequence.
  val partitionsToCompute: Seq[Int] = {
    if (stage.isShuffleMap) {
      (0 until stage.numPartitions).filter(id => stage.outputLocs(id) == Nil)
    } else {
      val job = stage.resultOfJob.get
      (0 until job.numPartitions).filter(id => !job.finished(id))
    }
  }

  // Check and look up the active job with the SAME key. Testing `jobId` but indexing with
  // `stage.jobId` can throw NoSuchElementException when the two ids differ.
  val properties = if (jobIdToActiveJob.contains(jobId)) {
    jobIdToActiveJob(jobId).properties
  } else {
    // this stage will be assigned to "default" pool
    null
  }

  // Record the stage as running.
  runningStages += stage

  // SparkListenerStageSubmitted should be posted before testing whether tasks are
  // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
  // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
  // event.
  stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))
  outputCommitCoordinator.stageStart(stage.id)
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
  // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
  // the serialized copy of the RDD and for each task we will deserialize it, which means each
  // task gets a different copy of the RDD. This provides stronger isolation between tasks that
  // might modify state of objects referenced in their closures. This is necessary in Hadoop
  // where the JobConf/Configuration object is not thread-safe.
  var taskBinary: Broadcast[Array[Byte]] = null
  try {
    // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
    // For ResultTask, serialize and broadcast (rdd, func).
    val taskBinaryBytes: Array[Byte] =
      if (stage.isShuffleMap) {
        closureSerializer.serialize((stage.rdd, stage.shuffleDep.get) : AnyRef).array()
      } else {
        closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func) : AnyRef).array()
      }
    taskBinary = sc.broadcast(taskBinaryBytes)
  } catch {
    // In the case of a failure during serialization, abort the stage.
    case e: NotSerializableException =>
      abortStage(stage, "Task not serializable: " + e.toString)
      runningStages -= stage
      return
    case NonFatal(e) =>
      abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
      runningStages -= stage
      return
  }

  // Create one task for every partition that still has to be computed.
  val tasks: Seq[Task[_]] = if (stage.isShuffleMap) {
    // Shuffle-map stage: one ShuffleMapTask per missing partition.
    partitionsToCompute.map { id =>
      // Preferred locations for the task working on this partition.
      val locs = getPreferredLocs(stage.rdd, id)
      val part = stage.rdd.partitions(id)
      new ShuffleMapTask(stage.id, taskBinary, part, locs)
    }
  } else {
    // Not a shuffle-map stage: it produces the result of a job, so ResultTasks are created.
    val job = stage.resultOfJob.get
    partitionsToCompute.map { id =>
      // `id` is the output index within the job; `p` is the matching partition of the RDD.
      val p: Int = job.partitions(id)
      val part = stage.rdd.partitions(p)
      // Preferred locations for the task working on this partition.
      val locs = getPreferredLocs(stage.rdd, p)
      new ResultTask(stage.id, taskBinary, part, locs, id)
    }
  }

  if (tasks.nonEmpty) {
    logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
    stage.pendingTasks ++= tasks
    logDebug("New pending tasks: " + stage.pendingTasks)
    taskScheduler.submitTasks(
      new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
    stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
  } else {
    // Because we posted SparkListenerStageSubmitted earlier, we should post
    // SparkListenerStageCompleted here in case there are no tasks to run.
    outputCommitCoordinator.stageEnd(stage.id)
    listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
    logDebug("Stage " + stage + " is actually done; %b %d %d".format(
      stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
    runningStages -= stage
  }
}

/**
 * Returns the preferred locations for the task that computes `partition` of `rdd`.
 *
 * Starts a fresh traversal by passing an empty visited set to `getPreferredLocsInternal`.
 *
 * @param rdd       the RDD whose partition is to be computed
 * @param partition index of the partition within `rdd`
 * @return the preferred task locations as reported by `getPreferredLocsInternal`
 */
def getPreferredLocs(rdd: RDD[_], partition: Int): Seq[TaskLocation] = {
  // Each top-level lookup gets its own, initially empty, set of visited (rdd, partition) pairs.
  val visited = new HashSet[(RDD[_], Int)]
  getPreferredLocsInternal(rdd, partition, visited)
}

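// Preferred location of the partition a task works on:

// starting from the last RDD of the stage, look for an RDD that has been cached (persisted) or checkpointed;

// the preferred location of the task is then the location of that cached/checkpointed partition,

// because a task running on that node does not have to recompute the preceding RDDs.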

/**
 * Recursive worker behind `getPreferredLocs`.
 *
 * Preference order for a given (rdd, partition) pair:
 *  1. the cache locations of the partition, if it is cached;
 *  2. the RDD's own placement preferences, if it has any;
 *  3. the first non-empty result found by recursing into the parent partitions of the
 *     RDD's narrow dependencies, in dependency order.
 *
 * @param rdd       the RDD to inspect
 * @param partition index of the partition within `rdd`
 * @param visited   (rdd, partition) pairs already examined during this traversal; updated
 *                  in place so that shared ancestors are explored only once
 * @return the preferred locations, or `Nil` when none could be determined (or the pair was
 *         already visited)
 */
private def getPreferredLocsInternal(
    rdd: RDD[_],
    partition: Int,
    visited: HashSet[(RDD[_], Int)]): Seq[TaskLocation] = {
  // If the partition has already been visited, no need to re-visit.
  // This avoids exponential path exploration. SPARK-695
  if (!visited.add((rdd, partition))) {
    // Nil has already been returned for previously visited partitions.
    return Nil
  }

  // If the partition is cached, return the cache locations.
  val cached = getCacheLocs(rdd)(partition)
  if (!cached.isEmpty) {
    return cached
  }

  // If the RDD has some placement preferences (as is the case for input RDDs), get those.
  val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
  if (!rddPrefs.isEmpty) {
    return rddPrefs.map(TaskLocation(_))
  }

  // If the RDD has narrow dependencies, pick the first partition of the first narrow dep
  // that has any placement preferences. Ideally we would choose based on transfer sizes,
  // but this will do for now.
  // The iterator chain is lazy, so the recursion stops at the first non-empty answer, just
  // like an early exit from nested loops would, without a non-local return from a lambda.
  val fromParents = rdd.dependencies.iterator
    .collect { case narrow: NarrowDependency[_] => narrow }
    .flatMap { narrow =>
      narrow.getParents(partition).iterator.map { inPart =>
        getPreferredLocsInternal(narrow.rdd, inPart, visited)
      }
    }
    .find(_.nonEmpty)

  // If no RDD in the lineage yielded a location there is no preference (Nil).
  fromParents.getOrElse(Nil)
}

Welcome to subscribe to "Shulou Technology Information" for the latest news, interesting stories, and hot topics in the IT industry, and to keep up with the hottest Internet news, technology news, and IT industry trends.

*The comments in the above article only represent the author's personal views and do not represent the views and positions of this website. If you have more insights, please feel free to contribute and share.