SparkContext是整个Spark应用的入口:不管是Spark Core、Spark Streaming还是Spark SQL,都需要首先创建一个SparkContext对象,然后基于这个SparkContext进行后续RDD的操作;所以很有必要了解SparkContext在初始化时做了哪些事情。
SparkContext初始化过程主要干了如下几件事情:
1、根据SparkContext的构造入参SparkConf创建SparkEnv;
2、初始化SparkUI;
3、创建TaskScheduler;
4、创建DAGScheduler;
5、启动taskScheduler;
通过源代码说明SparkContext初始化的过程
1、创建SparkEnv
// Create the SparkEnv for the driver (isDriver = true) from the SparkConf held by SparkContext.
// Host and port are read from the "spark.driver.host" / "spark.driver.port" settings.
// NOTE(review): the second argument is an empty string here; upstream Spark appears to pass
// an executor id such as "<driver>", which may have been stripped as an HTML tag when this
// excerpt was extracted — confirm against the Spark source before relying on it.
private[spark] val env = SparkEnv.create(
  conf,
  "",
  conf.get("spark.driver.host"),
  conf.get("spark.driver.port").toInt,
  isDriver = true,
  isLocal = isLocal,
  listenerBus = listenerBus)

// Register the newly created environment via SparkEnv.set.
SparkEnv.set(env)
2、初始化SparkUI
// Create the SparkUI for this SparkContext, then call bind() on it.
private[spark] val ui = new SparkUI(this)
ui.bind()
3、创建TaskScheduler:根据spark的运行模式创建不同的SchedulerBackend
// SparkContext field: the scheduler is built from the master URL.
private[spark] var taskScheduler = SparkContext.createTaskScheduler(this, master)

/**
 * Builds the TaskScheduler that matches the given master URL.
 *
 * Abridged excerpt: only the `spark://...` branch is shown here; a master string
 * that does not match it is not handled by this snippet.
 *
 * @param sc     the SparkContext handed to the scheduler and its backend
 * @param master the master URL, e.g. `spark://host1:port1,host2:port2`
 * @return a TaskSchedulerImpl that has already been initialized with its backend
 */
private def createTaskScheduler(sc: SparkContext, master: String): TaskScheduler = {
  val SPARK_REGEX = """spark://(.*)""".r
  master match {
    case SPARK_REGEX(sparkUrl) =>
      val scheduler = new TaskSchedulerImpl(sc)
      // A comma-separated host list becomes one "spark://" URL per entry.
      val masterUrls = sparkUrl.split(",").map("spark://" + _)
      val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend) // initializes the backend field of TaskSchedulerImpl
      scheduler
  }
}

// Abridged view of TaskSchedulerImpl: only the backend wiring and pool setup are shown.
class TaskSchedulerImpl extends TaskScheduler {
  var backend: SchedulerBackend = null

  // Stores the backend and builds the scheduling pools for the configured scheduling mode.
  def initialize(backend: SchedulerBackend) {
    this.backend = backend // keep the SparkDeploySchedulerBackend that was passed in
    rootPool = new Pool("", schedulingMode, 0, 0)
    schedulableBuilder = {
      schedulingMode match {
        case SchedulingMode.FIFO => // first-in, first-out scheduling
          new FIFOSchedulableBuilder(rootPool)
        case SchedulingMode.FAIR => // fair scheduling
          new FairSchedulableBuilder(rootPool, conf)
      }
    }
    schedulableBuilder.buildPools()
  }
}

// Backend created for `spark://` master URLs; the body is omitted in this excerpt.
private[spark] class SparkDeploySchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    masters: Array[String])
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem)
  with AppClientListener
  with Logging {
}
4、创建DAGScheduler:根据TaskScheduler创建DAGScheduler,用于接收提交过来的job
// Create the DAGScheduler on top of the TaskScheduler. This produces eventProcessActor,
// the DAGScheduler's communication channel, which can receive and send many kinds of messages.
@volatile private[spark] var dagScheduler: DAGScheduler = new DAGScheduler(this)

// Abridged view of DAGScheduler: only the event-actor setup is shown.
class DAGScheduler {
  def this(sc: SparkContext) = this(sc, sc.taskScheduler)

  // Asks dagSchedulerActorSupervisor to create the event-processing actor and blocks
  // (with a 30-second timeout) until the resulting ActorRef is available.
  private def initializeEventProcessActor() {
    implicit val timeout = Timeout(30 seconds)
    val initEventActorReply =
      dagSchedulerActorSupervisor ? Props(new DAGSchedulerEventProcessActor(this))
    eventProcessActor = Await.result(initEventActorReply, timeout.duration).
      asInstanceOf[ActorRef]
  }

  // Runs as part of the constructor body.
  initializeEventProcessActor()
}

// See the DAGScheduler chapter for a detailed analysis.
private[scheduler] class DAGSchedulerEventProcessActor(dagScheduler: DAGScheduler)
  extends Actor with Logging {

  // Hands the DAGScheduler to the TaskScheduler when the actor starts.
  override def preStart() {
    dagScheduler.taskScheduler.setDAGScheduler(dagScheduler)
  }

  // Dispatches incoming scheduler events to the matching DAGScheduler handler.
  def receive = {
    case JobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite, listener, properties) =>
      dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite,
        listener, properties)
    // ... (remaining message cases omitted in this excerpt)
  }
}
5、启动taskScheduler
启动taskScheduler的主要目的是启动相应的SchedulerBackend,并判断是否需要启用任务的推测执行(speculation)。
在启动TaskScheduler的过程中会创建Application并向Master发起注册请求;
// Called from SparkContext once the schedulers have been created.
taskScheduler.start()

// Abridged view of TaskSchedulerImpl: only start() is shown.
class TaskSchedulerImpl extends TaskScheduler {
  var backend: SchedulerBackend = null

  // Starts the scheduler backend.
  override def start() {
    backend.start()
    // spark.speculation handling omitted in this excerpt ...
  }
}

private[spark] class SparkDeploySchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    masters: Array[String])
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem)
  with AppClientListener
  with Logging {

  var client: AppClient = null
  // Optional core cap read from "spark.cores.max".
  val maxCores = conf.getOption("spark.cores.max").map(_.toInt)

  // Starts the driver actor via the parent class, then builds the application
  // description and starts an AppClient for it.
  override def start() {
    super.start() // calls CoarseGrainedSchedulerBackend.start()

    // Actor URL of the driver, built from the configured driver host and port.
    val driverUrl = "akka.tcp://spark@%s:%s/user/%s".format(
      conf.get("spark.driver.host"),
      conf.get("spark.driver.port"),
      CoarseGrainedSchedulerBackend.ACTOR_NAME)
    // Command naming CoarseGrainedExecutorBackend as the class to run.
    val command = Command(
      "org.apache.spark.executor.CoarseGrainedExecutorBackend",
      args,
      sc.executorEnvs,
      classPathEntries,
      libraryPathEntries,
      extraJavaOpts)
    val sparkHome = sc.getSparkHome()
    val appDesc = new ApplicationDescription(
      sc.appName,
      maxCores,
      sc.executorMemory,
      command,
      sparkHome,
      sc.ui.appUIAddress,
      sc.eventLogger.map(_.logDir))

    client = new AppClient(sc.env.actorSystem, masters, appDesc, this, conf)
    client.start()
  }
}

class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: ActorSystem)
  extends SchedulerBackend with Logging {

  var driverActor: ActorRef = null

  // Creates the DriverActor under the name CoarseGrainedSchedulerBackend.ACTOR_NAME.
  override def start() {
    driverActor = actorSystem.actorOf(
      Props(new DriverActor(properties)),
      name = CoarseGrainedSchedulerBackend.ACTOR_NAME)
  }
}

class ClientActor extends Actor with Logging {
  override def preStart() {
    registerWithMaster() // register the Application with the Master
  }
}
CoarseGrainedSchedulerBackend与CoarseGrainedExecutorBackend通信
// Executor-side actor that connects to the driver at driverUrl and registers itself.
// Abridged excerpt: the message handlers in receive are listed without their bodies.
private[spark] class CoarseGrainedExecutorBackend(
    driverUrl: String,
    executorId: String,
    hostPort: String,
    cores: Int)
  extends Actor with ExecutorBackend with Logging {

  var executor: Executor = null
  var driver: ActorSelection = null

  // On actor start: look up the driver, register this executor with it, and subscribe
  // to remoting lifecycle events.
  override def preStart() {
    logInfo("Connecting to driver: " + driverUrl)
    driver = context.actorSelection(driverUrl)
    // Register the executor; the receiver is CoarseGrainedSchedulerBackend.
    driver ! RegisterExecutor(executorId, hostPort, cores)
    context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
  }

  // Messages handled by this actor; handler bodies are omitted in this excerpt.
  override def receive = {
    case RegisteredExecutor(sparkProperties) => // handler omitted
    case LaunchTask(taskDesc) => // handler omitted
    case KillTask(taskId, _, interruptThread) => // handler omitted
    case StopExecutor => // handler omitted
  }
}