runc源码分析(一)-create和start流程-v1.0.0-rc2

流程简述

先简要地描述create和start的流程，以方便后面的代码分析。
我们使用runc create创建容器，使用runc start启动容器。主要流程如下：

运行runc create时，后台生成该命令的进程，我们称该进程为parent；
parent进程中运行runc init，我们称runc init进程为child进程；
child进程开始准备用户进程的运行环境，此时parent和child进程通过pipe进行通信；
child进程准备好用户进程的运行环境后，通知parent退出，自己则被exec.fifo阻塞；
由于parent退出(即runc create退出)，child成为孤儿进程，进而被1号进程(init)接管；
child进程一直被exec.fifo阻塞；
运行runc start时，会打开exec.fifo，使child的阻塞消除，runc start退出；
由于阻塞消除，child进程继续往下执行；
child进程使用用户定义的命令替换runc init，从而child进程成为容器内的主进程；
容器启动完成。

runc create

先来看runc create的执行流程，主要的运行流程定义在/create.go中：

// Action implements `runc create`: it loads the spec and then creates the
// container through startContainer with create=true.
Action: func(context *cli.Context) error {
	spec, specErr := setupSpec(context)
	if specErr != nil {
		return specErr
	}
	// create=true selects the "create only" path inside the runner.
	exitCode, startErr := startContainer(context, spec, true)
	if startErr == nil {
		// Exit with the container's exit status so that any external
		// supervisor is notified of the exit with the correct status.
		os.Exit(exitCode)
	}
	return startErr
}

可以看出，runc create主要调用了startContainer()函数，startContainer()函数定义在/utils_linux.go中：

// startContainer creates the libcontainer container named by the first CLI
// argument and runs the spec's process through a runner.
//
// create is forwarded to the runner: true means the caller is `runc create`,
// false means `runc run`. It returns the status produced by runner.run, or -1
// together with an error when the id is empty or the container cannot be
// created.
func startContainer(context *cli.Context, spec *specs.Spec, create bool) (int, error) {
	containerID := context.Args().First()
	if containerID == "" {
		return -1, errEmptyID
	}
	// Build the linuxContainer object via createContainer().
	ctr, err := createContainer(context, containerID, spec)
	if err != nil {
		return -1, err
	}
	// Support on-demand socket activation by passing file descriptors into
	// the container init process.
	activationFiles := []*os.File{}
	if os.Getenv("LISTEN_FDS") != "" {
		activationFiles = activation.Files(false)
	}
	r := runner{
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   true,
		container:       ctr,
		listenFDs:       activationFiles,
		console:         context.String("console"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		create:          create,
	}
	return r.run(&spec.Process)
}

startContainer()先调用createContainer()创建container，然后使用runner.run()。

先来看createContainer()，定义在同文件中：

// createContainer converts the OCI spec into a libcontainer config, checks
// that the configured rootfs exists, loads the factory and asks it to create
// the container with the given id.
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
	opts := &specconv.CreateOpts{
		CgroupName:       id,
		UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
		NoPivotRoot:      context.Bool("no-pivot"),
		NoNewKeyring:     context.Bool("no-new-keyring"),
		Spec:             spec,
	}
	config, err := specconv.CreateLibcontainerConfig(opts)
	if err != nil {
		return nil, err
	}
	// Report a missing rootfs with a dedicated message; pass any other stat
	// failure through unchanged.
	if _, statErr := os.Stat(config.Rootfs); os.IsNotExist(statErr) {
		return nil, fmt.Errorf("rootfs (%q) does not exist", config.Rootfs)
	} else if statErr != nil {
		return nil, statErr
	}
	// Build the LinuxFactory.
	factory, err := loadFactory(context)
	if err != nil {
		return nil, err
	}
	// Delegate to the factory's Create() method.
	return factory.Create(id, config)
}

createContainer()先调用loadFactory()生成factory，然后再调用factory.Create()方法生成container。关于factory，后面介绍。

再来看runner.run()：

// runner bundles the CLI options and the container that run() needs in order
// to start a process.
type runner struct {
	// enableSubreaper is passed to newSignalHandler in run().
	enableSubreaper bool
	// shouldDestroy is set to true by startContainer; how it is consumed is
	// not visible in this excerpt.
	shouldDestroy   bool
	// detach makes run() return right after the process has been started
	// instead of forwarding signals to it.
	detach          bool
	// listenFDs are the socket-activation files; run() appends them to the
	// process's ExtraFiles and exports LISTEN_FDS/LISTEN_PID.
	listenFDs       []*os.File
	// pidFile, when non-empty, is the path handed to createPidFile.
	pidFile         string
	// console is passed to setupIO.
	console         string
	// container is the libcontainer container the process is started in.
	container       libcontainer.Container
	// create selects container.Start (true, `runc create`) over
	// container.Run (false, `runc run`) in run().
	create          bool
}
// run builds a libcontainer process from the spec process, wires up its IO and
// starts it in r.container.
//
// For `runc create` (r.create) the process is started with container.Start;
// otherwise container.Run is used. When detached or creating, run returns
// (0, nil) once the process has been started and the optional pid file has
// been written. Otherwise it forwards signals to the process and returns the
// status reported by the signal handler. Every failure path calls r.destroy(),
// and failures after a successful start also call r.terminate(process) first.
func (r *runner) run(config *specs.Process) (int, error) {
	// Build the libcontainer process from the spec.
	process, err := newProcess(*config)
	if err != nil {
		r.destroy()
		return -1, err
	}
	// Hand the socket-activation descriptors to the process.
	if len(r.listenFDs) > 0 {
		process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
		process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
	}
	rootuid, err := r.container.Config().HostUID()
	if err != nil {
		r.destroy()
		return -1, err
	}
	rootgid, err := r.container.Config().HostGID()
	if err != nil {
		r.destroy()
		return -1, err
	}
	tty, err := setupIO(process, rootuid, rootgid, r.console, config.Terminal, r.detach || r.create)
	if err != nil {
		r.destroy()
		return -1, err
	}
	// The signal handler is created before the process is started.
	handler := newSignalHandler(tty, r.enableSubreaper)
	// For `runc create`, startFn is r.container.Start.
	// For `runc run`, startFn is r.container.Run.
	startFn := r.container.Start
	if !r.create {
		startFn = r.container.Run
	}
	defer tty.Close()
	if err := startFn(process); err != nil {
		r.destroy()
		return -1, err
	}
	if err := tty.ClosePostStart(); err != nil {
		r.terminate(process)
		r.destroy()
		return -1, err
	}
	if r.pidFile != "" {
		if err := createPidFile(r.pidFile, process); err != nil {
			r.terminate(process)
			r.destroy()
			return -1, err
		}
	}
	// Detached or create-only: nothing to wait for, report success.
	if r.detach || r.create {
		return 0, nil
	}
	status, err := handler.forward(process)
	if err != nil {
		r.terminate(process)
	}
	r.destroy()
	return status, err
}

run()先生成process，当命令是runc create时，则使用container.Start()来运行process；当命令是runc run时，则使用container.Run()来运行process，然后创建pid文件并写入pid。

factory.Create()

factory是整个libcontainer的入口，而factory又有两个入口：Create()和StartInitialization()。先来看factory的Create()入口，定义在/libcontainer/factory_linux.go中：

// Create validates id and config, prepares the container's state directory and
// exec FIFO under l.Root, and returns a linuxContainer in the stopped state.
//
// It runs outside the container and only produces the logical container
// object; no process is started here.
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
	if l.Root == "" {
		return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
	}
	// Validate the id.
	if err := l.validateID(id); err != nil {
		return nil, err
	}
	// Validate the config.
	if err := l.Validator.Validate(config); err != nil {
		return nil, newGenericError(err, ConfigInvalid)
	}
	uid, err := config.HostUID()
	if err != nil {
		return nil, newGenericError(err, SystemError)
	}
	gid, err := config.HostGID()
	if err != nil {
		return nil, newGenericError(err, SystemError)
	}
	// The state directory must not exist yet, e.g. /run/runc/nginx.
	containerRoot := filepath.Join(l.Root, id)
	_, statErr := os.Stat(containerRoot)
	switch {
	case statErr == nil:
		return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
	case !os.IsNotExist(statErr):
		return nil, newGenericError(statErr, SystemError)
	}
	// Create the state directory with mode 0711 and hand it to the host
	// uid/gid obtained above.
	if err := os.MkdirAll(containerRoot, 0711); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	if err := os.Chown(containerRoot, uid, gid); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	// Create exec.fifo, e.g. /run/runc/nginx/exec.fifo. The umask is cleared
	// around Mkfifo and restored right afterwards, whether or not it failed.
	fifoName := filepath.Join(containerRoot, execFifoFilename)
	oldMask := syscall.Umask(0000)
	mkfifoErr := syscall.Mkfifo(fifoName, 0622)
	syscall.Umask(oldMask)
	if mkfifoErr != nil {
		return nil, newGenericError(mkfifoErr, SystemError)
	}
	if err := os.Chown(fifoName, uid, gid); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	// Build the linuxContainer. In the article author's run: id "nginx",
	// root /run/runc/nginx, initArgs [/proc/self/exe init].
	c := &linuxContainer{
		id:       id,
		root:     containerRoot,
		config:   config,
		initArgs: l.InitArgs,
		criuPath: l.CriuPath,
		// Attach the cgroup manager.
		cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
	}
	c.state = &stoppedState{c: c}
	return c, nil
}

factory的Create()方法流程如下：

验证id；
验证config；
创建containerRoot目录，如/run/runc/nginx；
创建exec.fifo，exec.fifo是一个命名管道(FIFO)文件：只有一端(例如只有写端)打开时，打开操作会被阻塞；读端和写端都打开后才会继续往下执行；
生成container，其中initArgs为/proc/self/exe init，/proc/self/exe即为程序本身——runc，然后把container的状态设置为Stopped。

linuxContainer.Start()

再来看linuxContainer.Start()是如何运行process的。linuxContainer.Start()定义在/libcontainer/container_linux.go中：

// Start runs process in the container. It is executed outside the container.
//
// The container lock is held for the whole call. When the container's current
// status is Stopped, the process is started as the container's init
// (isInit=true); otherwise isInit is false.
func (c *linuxContainer) Start(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()
	status, err := c.currentStatus()
	if err != nil {
		return err
	}
	isInit := status == Stopped
	return c.start(process, isInit)
}

可以看出，Start()主要调用了start()方法，其中如果容器的状态为Stopped，则start()的isInit为true，即Stopped状态的container需要重新init。start()方法如下：

// start creates the parent process wrapper for process, starts it and records
// the resulting container state.
//
// When isInit is true the parent is an initProcess, the container moves to the
// created state, its state is persisted through updateState and the configured
// poststart hooks are run. When isInit is false the parent is a setnsProcess
// and the container is marked running.
func (c *linuxContainer) start(process *Process, isInit bool) error {
	parent, err := c.newParentProcess(process, isInit)
	if err != nil {
		return newSystemErrorWithCause(err, "creating new parent process")
	}
	// With isInit set this calls initProcess.start().
	if startErr := parent.start(); startErr != nil {
		// terminate the process to ensure that it properly is reaped.
		if termErr := parent.terminate(); termErr != nil {
			logrus.Warn(termErr)
		}
		return newSystemErrorWithCause(startErr, "starting container process")
	}
	// generate a timestamp indicating when the container was started
	c.created = time.Now().UTC()
	if !isInit {
		c.state = &runningState{
			c: c,
		}
		return nil
	}
	c.state = &createdState{
		c: c,
	}
	state, err := c.updateState(parent)
	if err != nil {
		return err
	}
	c.initProcessStartTime = state.InitProcessStartTime
	if c.config.Hooks == nil {
		return nil
	}
	// Run the poststart hooks defined in the config.
	hookState := configs.HookState{
		Version:    c.config.Version,
		ID:         c.id,
		Pid:        parent.pid(),
		Root:       c.config.Rootfs,
		BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
	}
	for idx, hook := range c.config.Hooks.Poststart {
		hookErr := hook.Run(hookState)
		if hookErr == nil {
			continue
		}
		if termErr := parent.terminate(); termErr != nil {
			logrus.Warn(termErr)
		}
		return newSystemErrorWithCausef(hookErr, "running poststart hook %d", idx)
	}
	return nil
}

start()的执行流程如下：

调用newParentProcess()生成parent。newParentProcess()会依据容器的状态生成initProcess或setnsProcess，runc create的parent为initProcess，newParentProcess()会把parent中的命令填充成/proc/self/exe init；
调用parent.start()启动parent，即启动initProcess；
设置container的状态为Created；
调用config.json文件中定义的poststart hook函数。

initProcess.start()

initProcess.start()定义在process_linux.go中：

// start launches the container's init process (`runc init`) and drives the
// parent side of the bootstrap handshake. It runs outside the container.
//
// Sequence: start the command, send the bootstrap data over parentPipe, wait
// for the namespace setup (execSetns), record the child's pipe descriptors,
// place the child in its cgroup, create the network interfaces, send the
// config, then exchange sync messages with the child until it closes the pipe.
//
// The result is named retErr so that the deferred cgroup cleanup registered
// after manager.Apply can observe every failure return. Previously the defer
// tested a local err that later failures never assigned (they used shadowed
// `if err := ...` variables), so the cgroup was never destroyed on error.
func (p *initProcess) start() (retErr error) {
	defer p.parentPipe.Close()
	// Start the child process. In the article author's trace p.cmd was
	// /proc/self/exe with args [/proc/self/exe init] and env
	// [_LIBCONTAINER_INITPIPE=3 _LIBCONTAINER_STATEDIR=4 _LIBCONTAINER_INITTYPE=standard].
	err := p.cmd.Start()
	p.process.ops = p
	p.childPipe.Close()
	p.rootDir.Close()
	if err != nil {
		p.process.ops = nil
		return newSystemErrorWithCause(err, "starting init process command")
	}
	// Send bootstrapData to the child over parentPipe; per the article's
	// annotation it is consumed by nsexec.c.
	if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
		return err
	}
	// Wait until the child's namespace setup has completed.
	if err := p.execSetns(); err != nil {
		return newSystemErrorWithCause(err, "running exec setns process for init")
	}
	// Save the standard descriptor names before the container process
	// can potentially move them (e.g., via dup2()).  If we don't do this now,
	// we won't know at checkpoint time which file descriptor to look up.
	fds, err := getPipeFds(p.pid())
	if err != nil {
		return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
	}
	p.setExternalDescriptors(fds)
	// Do this before syncing with child so that no children
	// can escape the cgroup
	// Adds the child's pid to the cgroup. In the article author's run the
	// systemd cgroup manager's Apply was the implementation invoked.
	if err := p.manager.Apply(p.pid()); err != nil {
		return newSystemErrorWithCause(err, "applying cgroup configuration for process")
	}
	defer func() {
		// Inspect the named result, not a local err, so that any failure
		// returned below triggers the cleanup.
		if retErr != nil {
			// TODO: should not be the responsibility to call here
			p.manager.Destroy()
		}
	}()
	// Set up the container's network interfaces.
	if err := p.createNetworkInterfaces(); err != nil {
		return newSystemErrorWithCause(err, "creating nework interfaces")
	}
	// Send the config to the child over the pipe; the child cannot proceed
	// until it has received it.
	if err := p.sendConfig(); err != nil {
		return newSystemErrorWithCause(err, "sending config to init process")
	}
	var (
		procSync   syncT
		sentRun    bool
		sentResume bool
		ierr       *genericError
	)
	dec := json.NewDecoder(p.parentPipe)
loop:
	for {
		// Read the next JSON sync message from parentPipe; EOF means the
		// child closed its end and the handshake is over.
		if err := dec.Decode(&procSync); err != nil {
			if err == io.EOF {
				break loop
			}
			return newSystemErrorWithCause(err, "decoding sync type from init pipe")
		}
		switch procSync.Type {
		case procReady:
			if err := p.manager.Set(p.config.Config); err != nil {
				return newSystemErrorWithCause(err, "setting cgroup config for ready process")
			}
			// set oom_score_adj
			if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
				return newSystemErrorWithCause(err, "setting oom score for ready process")
			}
			// set rlimits, this has to be done here because we lose permissions
			// to raise the limits once we enter a user-namespace
			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
				return newSystemErrorWithCause(err, "setting rlimits for ready process")
			}
			// call prestart hooks
			// Only when there is no new mount namespace; with NEWNS the
			// hooks are run from the procHooks case instead.
			if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
				if p.config.Config.Hooks != nil {
					s := configs.HookState{
						Version: p.container.config.Version,
						ID:      p.container.id,
						Pid:     p.pid(),
						Root:    p.config.Config.Rootfs,
					}
					for i, hook := range p.config.Config.Hooks.Prestart {
						if err := hook.Run(s); err != nil {
							return newSystemErrorWithCausef(err, "running prestart hook %d", i)
						}
					}
				}
			}
			// Sync with child.
			// Tell the child it may continue (procRun).
			if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
				return newSystemErrorWithCause(err, "writing syncT run type")
			}
			sentRun = true
		case procHooks:
			if p.config.Config.Hooks != nil {
				s := configs.HookState{
					Version:    p.container.config.Version,
					ID:         p.container.id,
					Pid:        p.pid(),
					Root:       p.config.Config.Rootfs,
					BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
				}
				for i, hook := range p.config.Config.Hooks.Prestart {
					if err := hook.Run(s); err != nil {
						return newSystemErrorWithCausef(err, "running prestart hook %d", i)
					}
				}
			}
			// Sync with child.
			if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
				return newSystemErrorWithCause(err, "writing syncT resume type")
			}
			sentResume = true
		case procError:
			// wait for the child process to fully complete and receive an error message
			// if one was encoutered
			if err := dec.Decode(&ierr); err != nil && err != io.EOF {
				return newSystemErrorWithCause(err, "decoding proc error from init")
			}
			if ierr != nil {
				break loop
			}
			// Programmer error.
			panic("No error following JSON procError payload.")
		default:
			return newSystemError(fmt.Errorf("invalid JSON payload from child"))
		}
	}
	if !sentRun {
		return newSystemErrorWithCause(ierr, "container init")
	}
	if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
		return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
	}
	if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
		return newSystemErrorWithCause(err, "shutting down init pipe")
	}
	// Must be done after Shutdown so the child will exit and we can wait for it.
	if ierr != nil {
		p.wait()
		return ierr
	}
	return nil
}

start()的主要流程如下：

启动initProcess，即parent中的命令——runc init，也就是child进程；
通过parentPipe向child进程发送bootstrapData；
调用execSetns()等待进程namespace等信息完成，详见runc的下一篇分析；
调用Apply()生成cgroup相关文件；
调用createNetworkInterfaces()初始化容器网络；
调用sendConfig()把配置发送给child进程，子进程在获取配置后才会往下执行；
通过pipe和child进程进行交互：
7.1 当收到child进程的procReady信号，则调用Set()在cgroup中添加资源限制；
7.2 当收到procHooks时，则执行config.json定义的prestart hooks；
7.3 当child进程把pipe关闭时，则parent退出交互；
返回，也就意味着runc create进程结束。

此时child进程(即/proc/self/exe init)仍在执行中。

/proc/self/exe init

所以我们来分析runc init的流程，runc init的流程定义在/main_unix.go中：

// Action implements `runc init`: it builds a factory and runs the in-container
// initialization. On success StartInitialization execs the user process and
// never returns, so reaching the final panic means the exec did not happen.
Action: func(context *cli.Context) error {
	factory, _ := libcontainer.New("")
	if factory.StartInitialization() != nil {
		// as the error is sent back to the parent there is no need to log
		// or write it to stderr because the parent process will handle this
		os.Exit(1)
	}
	panic("libcontainer: container init failed to exec")
}

可以看出，runc init主要调用了factory的第二个入口：factory.StartInitialization()。

factory.StartInitialization()

factory.StartInitialization()定义在/libcontainer/factory_linux.go中：

// StartInitialization is the container-side entry point, executed inside the
// `runc init` process.
//
// It reads the init pipe fd and state directory fd from the
// _LIBCONTAINER_INITPIPE and _LIBCONTAINER_STATEDIR environment variables and
// the init type from _LIBCONTAINER_INITTYPE, clears the environment, builds
// the matching initer with newContainerInit and calls its Init(). If this
// function returns, the deferred handlers report err to the parent over the
// pipe.
func (l *LinuxFactory) StartInitialization() (err error) {
	// Read the pipe and state-directory file descriptors from the environment.
	var pipefd, rootfd int
	for _, pair := range []struct {
		k string
		v *int
	}{
		{"_LIBCONTAINER_INITPIPE", &pipefd},
		{"_LIBCONTAINER_STATEDIR", &rootfd},
	} {
		s := os.Getenv(pair.k)
		i, err := strconv.Atoi(s)
		if err != nil {
			return fmt.Errorf("unable to convert %s=%s to int", pair.k, s)
		}
		*pair.v = i
	}
	var (
		pipe = os.NewFile(uintptr(pipefd), "pipe")
		it   = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
	)
	// clear the current process's environment to clean any libcontainer
	// specific env vars.
	os.Clearenv()
	var i initer
	// Deferred functions run in LIFO order, so the recover handler registered
	// below runs first and may set err before this one reports it.
	defer func() {
		// We have an error during the initialization of the container's init,
		// send it back to the parent process in the form of an initError.
		// If container's init succeeded, syscall.Exec will not return, hence
		// this defer function will never be called.
		if _, ok := i.(*linuxStandardInit); ok {
			//  Synchronisation only necessary for standard init.
			// Announce to the parent that an error payload follows.
			// NOTE(review): on a failed write this panics with err (the init
			// error), not werr (the write error) — confirm this is intended.
			if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
				panic(err)
			}
		}
		if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
			panic(err)
		}
		// ensure that this pipe is always closed
		pipe.Close()
	}()
	// Convert a panic raised during initialization into an error stored in
	// the named result.
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
		}
	}()
	// newContainerInit() is defined in init_linux.go.
	i, err = newContainerInit(it, pipe, rootfd)
	if err != nil {
		return err
	}
	// Run the initer's Init().
	return i.Init()
}

factory.StartInitialization()的主要流程如下：

从环境变量中获取pipe(环境变量在newParentProcess()中设置)及init的类型；
调用newContainerInit()生成init：linuxSetnsInit或linuxStandardInit；
调用init的Init()方法。

newContainerInit()定义在/libcontainer/init_linux.go中：

// newContainerInit decodes the initConfig sent by the parent over pipe,
// applies its environment to the current process and returns the initer that
// matches t: a linuxSetnsInit for initSetns or a linuxStandardInit for
// initStandard. Any other type yields an error.
func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
	// Read the configuration from the pipe.
	var config *initConfig
	decoder := json.NewDecoder(pipe)
	if err := decoder.Decode(&config); err != nil {
		return nil, err
	}
	// Export the environment variables listed in the config into this
	// process's environment.
	if err := populateProcessEnvironment(config.Env); err != nil {
		return nil, err
	}
	if t == initSetns {
		setnsInit := &linuxSetnsInit{
			config: config,
		}
		return setnsInit, nil
	}
	if t == initStandard {
		standardInit := &linuxStandardInit{
			pipe:       pipe,
			parentPid:  syscall.Getppid(),
			config:     config,
			stateDirFD: stateDirFD,
		}
		return standardInit, nil
	}
	return nil, fmt.Errorf("unknown init type %q", t)
}

newContainerInit()会一直阻塞到config信息过来。
当运行runc create时生成的是linuxStandardInit。

linuxStandardInit

linuxStandardInit定义在/libcontainer/standard_init_linux.go中：

// linuxStandardInit holds what the standard (non-setns) init needs to set up
// the container environment and exec the user process.
type linuxStandardInit struct {
	// pipe is the sync channel to the parent; Init() closes it once
	// initialization has completed.
	pipe       io.ReadWriteCloser
	// parentPid is the parent pid recorded at construction time; Init()
	// compares it with the current parent pid to detect reparenting.
	parentPid  int
	// stateDirFD is the directory fd relative to which Init() opens the
	// exec FIFO.
	stateDirFD int
	// config is the init configuration received from the parent.
	config     *initConfig
}

其主要有Init()方法：

// Init prepares the execution environment for the container's main process
// and finally replaces the current process with the user's command.
//
// In order it: joins a session keyring (unless disabled), sets up the console,
// network, routes and rootfs, applies hostname, AppArmor profile, process
// label, sysctls and path restrictions, signals readiness to the parent over
// l.pipe, applies seccomp and finalizes the namespace, closes the pipe, blocks
// opening the exec FIFO for writing, writes "0" to it, and calls syscall.Exec.
// On success Exec does not return; any returned value is an error from one of
// the steps above.
func (l *linuxStandardInit) Init() error {
	if !l.config.Config.NoNewKeyring {
		ringname, keepperms, newperms := l.getSessionRingParams()
		// do not inherit the parent's session keyring
		sessKeyId, err := keys.JoinSessionKeyring(ringname)
		if err != nil {
			return err
		}
		// make session keyring searcheable
		if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
			return err
		}
	}
	var console *linuxConsole
	if l.config.Console != "" {
		console = newConsoleFromPath(l.config.Console)
		if err := console.dupStdio(); err != nil {
			return err
		}
	}
	if console != nil {
		if err := system.Setctty(); err != nil {
			return err
		}
	}
	if err := setupNetwork(l.config); err != nil {
		return err
	}
	if err := setupRoute(l.config.Config); err != nil {
		return err
	}
	label.Init()
	// InitializeMountNamespace() can be executed only for a new mount namespace
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := setupRootfs(l.config.Config, console, l.pipe); err != nil {
			return err
		}
	}
	if hostname := l.config.Config.Hostname; hostname != "" {
		if err := syscall.Sethostname([]byte(hostname)); err != nil {
			return err
		}
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return err
	}
	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
		return err
	}
	for key, value := range l.config.Config.Sysctl {
		if err := writeSystemProperty(key, value); err != nil {
			return err
		}
	}
	for _, path := range l.config.Config.ReadonlyPaths {
		if err := remountReadonly(path); err != nil {
			return err
		}
	}
	for _, path := range l.config.Config.MaskPaths {
		if err := maskPath(path); err != nil {
			return err
		}
	}
	pdeath, err := system.GetParentDeathSignal()
	if err != nil {
		return err
	}
	if l.config.NoNewPrivileges {
		if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return err
		}
	}
	// Tell our parent that we're ready to Execv. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {
		return err
	}
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	// finalizeNamespace can change user/group which clears the parent death
	// signal, so we restore it here.
	if err := pdeath.Restore(); err != nil {
		return err
	}
	// compare the parent from the inital start of the init process and make sure that it did not change.
	// if the parent changes that means it died and we were reparented to something else so we should
	// just kill ourself and not cause problems for someone else.
	if syscall.Getppid() != l.parentPid {
		return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
	}
	// check for the arg before waiting to make sure it exists and it is returned
	// as a create time error.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}
	// close the pipe to signal that we have completed our init.
	// After this close the exchange between the parent process and this
	// process is finished.
	l.pipe.Close()
	// wait for the fifo to be opened on the other side before
	// exec'ing the users process.
	// This open blocks until another process opens exec.fifo.
	fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
	if err != nil {
		return newSystemErrorWithCause(err, "openat exec fifo")
	}
	// Write "0" into exec.fifo.
	if _, err := syscall.Write(fd, []byte("0")); err != nil {
		return newSystemErrorWithCause(err, "write 0 exec fifo")
	}
	// With NoNewPrivileges set, seccomp is applied here, right before exec.
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return newSystemErrorWithCause(err, "init seccomp")
		}
	}
	// Example from the article author's run: name is /bin/date and
	// l.config.Args[0:] is [/bin/date].
	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
		return newSystemErrorWithCause(err, "exec user process")
	}
	return nil
}

Init()方法会开始准备执行环境，并与parent进程进行交互，之后主动关闭pipe结束交互，接着在syscall.Openat(exec.fifo)处阻塞。阻塞消除后，Init()往exec.fifo写入"0"。最后，神奇的事情发生了：调用syscall.Exec()后，用户的命令替换了init命令，child进程完成了华丽丽的转变，容器启动完毕。
所以，这里关键的是阻塞，阻塞的消除由runc start完成。

runc start

runc start的流程定义在/start.go中：

// Action implements `runc start`: it looks up the container and, only when it
// is in the Created state, calls Exec() on it. Every other state is rejected
// with an error describing why the container cannot be started.
Action: func(context *cli.Context) error {
	container, err := getContainer(context)
	if err != nil {
		return err
	}
	status, err := container.Status()
	if err != nil {
		return err
	}
	if status == libcontainer.Created {
		return container.Exec()
	}
	if status == libcontainer.Stopped {
		return fmt.Errorf("cannot start a container that has run and stopped")
	}
	if status == libcontainer.Running {
		return fmt.Errorf("cannot start an already running container")
	}
	return fmt.Errorf("cannot start a container in the %s state", status)
}

可以看出，如果容器处于Created状态，那么就调用container.Exec()。而容器的Created状态是由parent进程(runc create)在最后设置的。

container.exec()

container.exec()定义在/libcontainer/container_linux.go中：

// Exec takes the container lock and delegates to exec(); the lock is released
// when exec() returns.
func (c *linuxContainer) Exec() error {
	c.m.Lock()
	defer c.m.Unlock()
	return c.exec()
}
// exec opens the container's exec FIFO for reading and drains it. Reading any
// data means the init process blocked on the FIFO has been released; the FIFO
// file is then removed. An empty read is reported as the container already
// running.
func (c *linuxContainer) exec() error {
	// Example path: /run/runc/nginx/exec.fifo.
	fifoPath := filepath.Join(c.root, execFifoFilename)
	fifo, err := os.OpenFile(fifoPath, os.O_RDONLY, 0)
	if err != nil {
		return newSystemErrorWithCause(err, "open exec fifo for reading")
	}
	defer fifo.Close()
	// The init process writes "0"; once that data has been taken out of
	// exec.fifo, the process started by `runc create` continues.
	payload, err := ioutil.ReadAll(fifo)
	if err != nil {
		return err
	}
	if len(payload) == 0 {
		return fmt.Errorf("cannot start an already running container")
	}
	os.Remove(fifoPath)
	return nil
}

Exec()的流程很简单：打开exec.fifo并读出其中的数据，让child进程往下执行。

总结

本次分析介绍了runc create和runc start的流程，其中很多细节并未详细介绍，如namespace的建立，cgroup的处理等，这些将会后续详细分析。
create和start流程分析完毕。