Diffstat (limited to 'libcontainerd'): 27 files changed, 3586 insertions, 0 deletions
diff --git a/libcontainerd/client.go b/libcontainerd/client.go new file mode 100644 index 0000000..c9004b8 --- /dev/null +++ b/libcontainerd/client.go @@ -0,0 +1,46 @@ +package libcontainerd + +import ( + "fmt" + "sync" + + "github.com/docker/docker/pkg/locker" +) + +// clientCommon contains the platform agnostic fields used in the client structure +type clientCommon struct { + backend Backend + containers map[string]*container + locker *locker.Locker + mapMutex sync.RWMutex // protects read/write operations from containers map +} + +func (clnt *client) lock(containerID string) { + clnt.locker.Lock(containerID) +} + +func (clnt *client) unlock(containerID string) { + clnt.locker.Unlock(containerID) +} + +// must hold a lock for cont.containerID +func (clnt *client) appendContainer(cont *container) { + clnt.mapMutex.Lock() + clnt.containers[cont.containerID] = cont + clnt.mapMutex.Unlock() +} +func (clnt *client) deleteContainer(containerID string) { + clnt.mapMutex.Lock() + delete(clnt.containers, containerID) + clnt.mapMutex.Unlock() +} + +func (clnt *client) getContainer(containerID string) (*container, error) { + clnt.mapMutex.RLock() + container, ok := clnt.containers[containerID] + defer clnt.mapMutex.RUnlock() + if !ok { + return nil, fmt.Errorf("invalid container: %s", containerID) // fixme: typed error + } + return container, nil +} diff --git a/libcontainerd/client_linux.go b/libcontainerd/client_linux.go new file mode 100644 index 0000000..a6986b5 --- /dev/null +++ b/libcontainerd/client_linux.go @@ -0,0 +1,619 @@ +package libcontainerd + +import ( + "fmt" + "os" + "strings" + "sync" + "syscall" + "time" + + "github.com/Sirupsen/logrus" + containerd "github.com/containerd/containerd/api/grpc/types" + containerd_runtime_types "github.com/containerd/containerd/runtime" + "github.com/docker/docker/pkg/ioutils" + "github.com/docker/docker/pkg/mount" + "github.com/golang/protobuf/ptypes" + "github.com/golang/protobuf/ptypes/timestamp" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/net/context" +) + +type client struct { + clientCommon + + // Platform specific properties below here. + remote *remote + q queue + exitNotifiers map[string]*exitNotifier + liveRestore bool +} + +// GetServerVersion returns the connected server version information +func (clnt *client) GetServerVersion(ctx context.Context) (*ServerVersion, error) { + resp, err := clnt.remote.apiClient.GetServerVersion(ctx, &containerd.GetServerVersionRequest{}) + if err != nil { + return nil, err + } + + sv := &ServerVersion{ + GetServerVersionResponse: *resp, + } + + return sv, nil +} + +// AddProcess is the handler for adding a process to an already running +// container. It's called through docker exec. It returns the system pid of the +// exec'd process. 
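A hedged sketch of how the daemon's exec path might drive the AddProcess implementation that follows; clnt, containerID, and the attach wiring are assumed to be in scope, and the exec name is illustrative, not part of this commit:

	cwd := "/"
	spec := Process{
		Args:     []string{"/bin/sh"},
		Terminal: true,
		Cwd:      &cwd,
	}
	attach := func(pipe IOPipe) error {
		// Wire pipe.Stdin/pipe.Stdout/pipe.Stderr to the attach stream here.
		return nil
	}
	// "exec-1" is the processFriendlyName that keys container.processes.
	pid, err := clnt.AddProcess(context.Background(), containerID, "exec-1", spec, attach)
	if err != nil {
		return err
	}
	logrus.Debugf("exec running with system pid %d", pid)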
+func (clnt *client) AddProcess(ctx context.Context, containerID, processFriendlyName string, specp Process, attachStdio StdioCallback) (pid int, err error) { + clnt.lock(containerID) + defer clnt.unlock(containerID) + container, err := clnt.getContainer(containerID) + if err != nil { + return -1, err + } + + spec, err := container.spec() + if err != nil { + return -1, err + } + sp := spec.Process + sp.Args = specp.Args + sp.Terminal = specp.Terminal + if len(specp.Env) > 0 { + sp.Env = specp.Env + } + if specp.Cwd != nil { + sp.Cwd = *specp.Cwd + } + if specp.User != nil { + sp.User = specs.User{ + UID: specp.User.UID, + GID: specp.User.GID, + AdditionalGids: specp.User.AdditionalGids, + } + } + if specp.Capabilities != nil { + sp.Capabilities.Bounding = specp.Capabilities + sp.Capabilities.Effective = specp.Capabilities + sp.Capabilities.Inheritable = specp.Capabilities + sp.Capabilities.Permitted = specp.Capabilities + } + + p := container.newProcess(processFriendlyName) + + r := &containerd.AddProcessRequest{ + Args: sp.Args, + Cwd: sp.Cwd, + Terminal: sp.Terminal, + Id: containerID, + Env: sp.Env, + User: &containerd.User{ + Uid: sp.User.UID, + Gid: sp.User.GID, + AdditionalGids: sp.User.AdditionalGids, + }, + Pid: processFriendlyName, + Stdin: p.fifo(syscall.Stdin), + Stdout: p.fifo(syscall.Stdout), + Stderr: p.fifo(syscall.Stderr), + Capabilities: sp.Capabilities.Effective, + ApparmorProfile: sp.ApparmorProfile, + SelinuxLabel: sp.SelinuxLabel, + NoNewPrivileges: sp.NoNewPrivileges, + Rlimits: convertRlimits(sp.Rlimits), + } + + fifoCtx, cancel := context.WithCancel(context.Background()) + defer func() { + if err != nil { + cancel() + } + }() + + iopipe, err := p.openFifos(fifoCtx, sp.Terminal) + if err != nil { + return -1, err + } + + resp, err := clnt.remote.apiClient.AddProcess(ctx, r) + if err != nil { + p.closeFifos(iopipe) + return -1, err + } + + var stdinOnce sync.Once + stdin := iopipe.Stdin + iopipe.Stdin = ioutils.NewWriteCloserWrapper(stdin, func() error { + var err error + stdinOnce.Do(func() { // on error from attach we don't know if stdin was already closed + err = stdin.Close() + if err2 := p.sendCloseStdin(); err == nil { + err = err2 + } + }) + return err + }) + + container.processes[processFriendlyName] = p + + if err := attachStdio(*iopipe); err != nil { + p.closeFifos(iopipe) + return -1, err + } + + return int(resp.SystemPid), nil +} + +func (clnt *client) SignalProcess(containerID string, pid string, sig int) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + _, err := clnt.remote.apiClient.Signal(context.Background(), &containerd.SignalRequest{ + Id: containerID, + Pid: pid, + Signal: uint32(sig), + }) + return err +} + +func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + if _, err := clnt.getContainer(containerID); err != nil { + return err + } + _, err := clnt.remote.apiClient.UpdateProcess(context.Background(), &containerd.UpdateProcessRequest{ + Id: containerID, + Pid: processFriendlyName, + Width: uint32(width), + Height: uint32(height), + }) + return err +} + +func (clnt *client) Pause(containerID string) error { + return clnt.setState(containerID, StatePause) +} + +func (clnt *client) setState(containerID, state string) error { + clnt.lock(containerID) + container, err := clnt.getContainer(containerID) + if err != nil { + clnt.unlock(containerID) + return err + } + if container.systemPid == 0 { + clnt.unlock(containerID) + return 
fmt.Errorf("No active process for container %s", containerID) + } + st := "running" + if state == StatePause { + st = "paused" + } + chstate := make(chan struct{}) + _, err = clnt.remote.apiClient.UpdateContainer(context.Background(), &containerd.UpdateContainerRequest{ + Id: containerID, + Pid: InitFriendlyName, + Status: st, + }) + if err != nil { + clnt.unlock(containerID) + return err + } + container.pauseMonitor.append(state, chstate) + clnt.unlock(containerID) + <-chstate + return nil +} + +func (clnt *client) Resume(containerID string) error { + return clnt.setState(containerID, StateResume) +} + +func (clnt *client) Stats(containerID string) (*Stats, error) { + resp, err := clnt.remote.apiClient.Stats(context.Background(), &containerd.StatsRequest{containerID}) + if err != nil { + return nil, err + } + return (*Stats)(resp), nil +} + +// Take care of the old 1.11.0 behavior in case the version upgrade +// happened without a clean daemon shutdown +func (clnt *client) cleanupOldRootfs(containerID string) { + // Unmount and delete the bundle folder + if mts, err := mount.GetMounts(); err == nil { + for _, mts := range mts { + if strings.HasSuffix(mts.Mountpoint, containerID+"/rootfs") { + if err := syscall.Unmount(mts.Mountpoint, syscall.MNT_DETACH); err == nil { + os.RemoveAll(strings.TrimSuffix(mts.Mountpoint, "/rootfs")) + } + break + } + } + } +} + +func (clnt *client) setExited(containerID string, exitCode uint32) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + + err := clnt.backend.StateChanged(containerID, StateInfo{ + CommonStateInfo: CommonStateInfo{ + State: StateExit, + ExitCode: exitCode, + }}) + + clnt.cleanupOldRootfs(containerID) + + return err +} + +func (clnt *client) GetPidsForContainer(containerID string) ([]int, error) { + cont, err := clnt.getContainerdContainer(containerID) + if err != nil { + return nil, err + } + pids := make([]int, len(cont.Pids)) + for i, p := range cont.Pids { + pids[i] = int(p) + } + return pids, nil +} + +// Summary returns a summary of the processes running in a container. +// This is a no-op on Linux. 
+func (clnt *client) Summary(containerID string) ([]Summary, error) { + return nil, nil +} + +func (clnt *client) getContainerdContainer(containerID string) (*containerd.Container, error) { + resp, err := clnt.remote.apiClient.State(context.Background(), &containerd.StateRequest{Id: containerID}) + if err != nil { + return nil, err + } + for _, cont := range resp.Containers { + if cont.Id == containerID { + return cont, nil + } + } + return nil, fmt.Errorf("invalid state response") +} + +func (clnt *client) UpdateResources(containerID string, resources Resources) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + container, err := clnt.getContainer(containerID) + if err != nil { + return err + } + if container.systemPid == 0 { + return fmt.Errorf("No active process for container %s", containerID) + } + _, err = clnt.remote.apiClient.UpdateContainer(context.Background(), &containerd.UpdateContainerRequest{ + Id: containerID, + Pid: InitFriendlyName, + Resources: (*containerd.UpdateResource)(&resources), + }) + if err != nil { + return err + } + return nil +} + +func (clnt *client) getExitNotifier(containerID string) *exitNotifier { + clnt.mapMutex.RLock() + defer clnt.mapMutex.RUnlock() + return clnt.exitNotifiers[containerID] +} + +func (clnt *client) getOrCreateExitNotifier(containerID string) *exitNotifier { + clnt.mapMutex.Lock() + w, ok := clnt.exitNotifiers[containerID] + defer clnt.mapMutex.Unlock() + if !ok { + w = &exitNotifier{c: make(chan struct{}), client: clnt} + clnt.exitNotifiers[containerID] = w + } + return w +} + +func (clnt *client) restore(cont *containerd.Container, lastEvent *containerd.Event, attachStdio StdioCallback, options ...CreateOption) (err error) { + clnt.lock(cont.Id) + defer clnt.unlock(cont.Id) + + logrus.Debugf("libcontainerd: restore container %s state %s", cont.Id, cont.Status) + + containerID := cont.Id + if _, err := clnt.getContainer(containerID); err == nil { + return fmt.Errorf("container %s is already active", containerID) + } + + defer func() { + if err != nil { + clnt.deleteContainer(cont.Id) + } + }() + + container := clnt.newContainer(cont.BundlePath, options...) 
+ container.systemPid = systemPid(cont) + + var terminal bool + for _, p := range cont.Processes { + if p.Pid == InitFriendlyName { + terminal = p.Terminal + } + } + + fifoCtx, cancel := context.WithCancel(context.Background()) + defer func() { + if err != nil { + cancel() + } + }() + + iopipe, err := container.openFifos(fifoCtx, terminal) + if err != nil { + return err + } + var stdinOnce sync.Once + stdin := iopipe.Stdin + iopipe.Stdin = ioutils.NewWriteCloserWrapper(stdin, func() error { + var err error + stdinOnce.Do(func() { // on error from attach we don't know if stdin was already closed + err = stdin.Close() + }) + return err + }) + + if err := attachStdio(*iopipe); err != nil { + container.closeFifos(iopipe) + return err + } + + clnt.appendContainer(container) + + err = clnt.backend.StateChanged(containerID, StateInfo{ + CommonStateInfo: CommonStateInfo{ + State: StateRestore, + Pid: container.systemPid, + }}) + + if err != nil { + container.closeFifos(iopipe) + return err + } + + if lastEvent != nil { + // This should only be a pause or resume event + if lastEvent.Type == StatePause || lastEvent.Type == StateResume { + return clnt.backend.StateChanged(containerID, StateInfo{ + CommonStateInfo: CommonStateInfo{ + State: lastEvent.Type, + Pid: container.systemPid, + }}) + } + + logrus.Warnf("libcontainerd: unexpected backlog event: %#v", lastEvent) + } + + return nil +} + +func (clnt *client) getContainerLastEventSinceTime(id string, tsp *timestamp.Timestamp) (*containerd.Event, error) { + er := &containerd.EventsRequest{ + Timestamp: tsp, + StoredOnly: true, + Id: id, + } + events, err := clnt.remote.apiClient.Events(context.Background(), er) + if err != nil { + logrus.Errorf("libcontainerd: failed to get container events stream for %s: %q", er.Id, err) + return nil, err + } + + var ev *containerd.Event + for { + e, err := events.Recv() + if err != nil { + if err.Error() == "EOF" { + break + } + logrus.Errorf("libcontainerd: failed to get container event for %s: %q", id, err) + return nil, err + } + ev = e + logrus.Debugf("libcontainerd: received past event %#v", ev) + } + + return ev, nil +} + +func (clnt *client) getContainerLastEvent(id string) (*containerd.Event, error) { + ev, err := clnt.getContainerLastEventSinceTime(id, clnt.remote.restoreFromTimestamp) + if err == nil && ev == nil { + // If ev is nil and the container is running in containerd, + // we already consumed all the event of the + // container, included the "exit" one. + // Thus, we request all events containerd has in memory for + // this container in order to get the last one (which should + // be an exit event) + logrus.Warnf("libcontainerd: client is out of sync, restore was called on a fully synced container (%s).", id) + // Request all events since beginning of time + t := time.Unix(0, 0) + tsp, err := ptypes.TimestampProto(t) + if err != nil { + logrus.Errorf("libcontainerd: getLastEventSinceTime() failed to convert timestamp: %q", err) + return nil, err + } + + return clnt.getContainerLastEventSinceTime(id, tsp) + } + + return ev, err +} + +func (clnt *client) Restore(containerID string, attachStdio StdioCallback, options ...CreateOption) error { + // Synchronize with live events + clnt.remote.Lock() + defer clnt.remote.Unlock() + // Check that containerd still knows this container. + // + // In the unlikely event that Restore for this container process + // the its past event before the main loop, the event will be + // processed twice. 
However, this is not an issue as all those + // events will do is change the state of the container to be + // exactly the same. + cont, err := clnt.getContainerdContainer(containerID) + // Get its last event + ev, eerr := clnt.getContainerLastEvent(containerID) + if err != nil || containerd_runtime_types.State(cont.Status) == containerd_runtime_types.Stopped { + if err != nil { + logrus.Warnf("libcontainerd: failed to retrieve container %s state: %v", containerID, err) + } + if ev != nil && (ev.Pid != InitFriendlyName || ev.Type != StateExit) { + // Wait a while for the exit event + timeout := time.NewTimer(10 * time.Second) + tick := time.NewTicker(100 * time.Millisecond) + stop: + for { + select { + case <-timeout.C: + break stop + case <-tick.C: + ev, eerr = clnt.getContainerLastEvent(containerID) + if eerr != nil { + break stop + } + if ev != nil && ev.Pid == InitFriendlyName && ev.Type == StateExit { + break stop + } + } + } + timeout.Stop() + tick.Stop() + } + + // get the exit status for this container, if we don't have + // one, indicate an error + ec := uint32(255) + if eerr == nil && ev != nil && ev.Pid == InitFriendlyName && ev.Type == StateExit { + ec = ev.Status + } + clnt.setExited(containerID, ec) + + return nil + } + + // container is still alive + if clnt.liveRestore { + if err := clnt.restore(cont, ev, attachStdio, options...); err != nil { + logrus.Errorf("libcontainerd: error restoring %s: %v", containerID, err) + } + return nil + } + + // Kill the container if liveRestore == false + w := clnt.getOrCreateExitNotifier(containerID) + clnt.lock(cont.Id) + container := clnt.newContainer(cont.BundlePath) + container.systemPid = systemPid(cont) + clnt.appendContainer(container) + clnt.unlock(cont.Id) + + container.discardFifos() + + if err := clnt.Signal(containerID, int(syscall.SIGTERM)); err != nil { + logrus.Errorf("libcontainerd: error sending sigterm to %v: %v", containerID, err) + } + + // Let the main loop handle the exit event + clnt.remote.Unlock() + + if ev != nil && ev.Type == StatePause { + // resume container, it depends on the main loop, so we do it after Unlock() + logrus.Debugf("libcontainerd: %s was paused, resuming it so it can die", containerID) + if err := clnt.Resume(containerID); err != nil { + return fmt.Errorf("failed to resume container: %v", err) + } + } + + select { + case <-time.After(10 * time.Second): + if err := clnt.Signal(containerID, int(syscall.SIGKILL)); err != nil { + logrus.Errorf("libcontainerd: error sending sigkill to %v: %v", containerID, err) + } + select { + case <-time.After(2 * time.Second): + case <-w.wait(): + // relock because of the defer + clnt.remote.Lock() + return nil + } + case <-w.wait(): + // relock because of the defer + clnt.remote.Lock() + return nil + } + // relock because of the defer + clnt.remote.Lock() + + clnt.deleteContainer(containerID) + + return clnt.setExited(containerID, uint32(255)) +} + +func (clnt *client) CreateCheckpoint(containerID string, checkpointID string, checkpointDir string, exit bool) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + if _, err := clnt.getContainer(containerID); err != nil { + return err + } + + _, err := clnt.remote.apiClient.CreateCheckpoint(context.Background(), &containerd.CreateCheckpointRequest{ + Id: containerID, + Checkpoint: &containerd.Checkpoint{ + Name: checkpointID, + Exit: exit, + Tcp: true, + UnixSockets: true, + Shell: false, + EmptyNS: []string{"network"}, + }, + CheckpointDir: checkpointDir, + }) + return err +} + +func (clnt *client) 
DeleteCheckpoint(containerID string, checkpointID string, checkpointDir string) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + if _, err := clnt.getContainer(containerID); err != nil { + return err + } + + _, err := clnt.remote.apiClient.DeleteCheckpoint(context.Background(), &containerd.DeleteCheckpointRequest{ + Id: containerID, + Name: checkpointID, + CheckpointDir: checkpointDir, + }) + return err +} + +func (clnt *client) ListCheckpoints(containerID string, checkpointDir string) (*Checkpoints, error) { + clnt.lock(containerID) + defer clnt.unlock(containerID) + if _, err := clnt.getContainer(containerID); err != nil { + return nil, err + } + + resp, err := clnt.remote.apiClient.ListCheckpoint(context.Background(), &containerd.ListCheckpointRequest{ + Id: containerID, + CheckpointDir: checkpointDir, + }) + if err != nil { + return nil, err + } + return (*Checkpoints)(resp), nil +} diff --git a/libcontainerd/client_solaris.go b/libcontainerd/client_solaris.go new file mode 100644 index 0000000..cb93997 --- /dev/null +++ b/libcontainerd/client_solaris.go @@ -0,0 +1,101 @@ +package libcontainerd + +import "golang.org/x/net/context" + +type client struct { + clientCommon + + // Platform specific properties below here. + remote *remote + q queue + exitNotifiers map[string]*exitNotifier + liveRestore bool +} + +// GetServerVersion returns the connected server version information +func (clnt *client) GetServerVersion(ctx context.Context) (*ServerVersion, error) { + resp, err := clnt.remote.apiClient.GetServerVersion(ctx, &containerd.GetServerVersionRequest{}) + if err != nil { + return nil, err + } + + sv := &ServerVersion{ + GetServerVersionResponse: *resp, + } + + return sv, nil +} + +func (clnt *client) AddProcess(ctx context.Context, containerID, processFriendlyName string, specp Process, attachStdio StdioCallback) (int, error) { + return -1, nil +} + +func (clnt *client) SignalProcess(containerID string, pid string, sig int) error { + return nil +} + +func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error { + return nil +} + +func (clnt *client) Pause(containerID string) error { + return nil +} + +func (clnt *client) Resume(containerID string) error { + return nil +} + +func (clnt *client) Stats(containerID string) (*Stats, error) { + return nil, nil +} + +func (clnt *client) getExitNotifier(containerID string) *exitNotifier { + clnt.mapMutex.RLock() + defer clnt.mapMutex.RUnlock() + return clnt.exitNotifiers[containerID] +} + +func (clnt *client) getOrCreateExitNotifier(containerID string) *exitNotifier { + clnt.mapMutex.Lock() + defer clnt.mapMutex.Unlock() + w, ok := clnt.exitNotifiers[containerID] + if !ok { + w = &exitNotifier{c: make(chan struct{}), client: clnt} + clnt.exitNotifiers[containerID] = w + } + return w +} + +// Restore is the handler for restoring a container +func (clnt *client) Restore(containerID string, attachStdio StdioCallback, options ...CreateOption) error { + return nil +} + +func (clnt *client) GetPidsForContainer(containerID string) ([]int, error) { + return nil, nil +} + +// Summary returns a summary of the processes running in a container. +func (clnt *client) Summary(containerID string) ([]Summary, error) { + return nil, nil +} + +// UpdateResources updates resources for a running container. 
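Restore's wait-for-exit block above boils down to a reusable shape: poll on a ticker, give up on a timer. A self-contained sketch with the containerd event lookup replaced by a hypothetical probe (the function name is illustrative):

	// waitFor polls probe every 100ms and gives up after 10s, mirroring
	// the timer/ticker loop in Restore.
	func waitFor(probe func() (bool, error)) bool {
		timeout := time.NewTimer(10 * time.Second)
		tick := time.NewTicker(100 * time.Millisecond)
		defer timeout.Stop()
		defer tick.Stop()
		for {
			select {
			case <-timeout.C:
				return false // deadline hit; Restore then falls back to exit code 255
			case <-tick.C:
				if done, err := probe(); err != nil || done {
					return done
				}
			}
		}
	}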
+func (clnt *client) UpdateResources(containerID string, resources Resources) error { + // Updating resource isn't supported on Solaris + // but we should return nil for enabling updating container + return nil +} + +func (clnt *client) CreateCheckpoint(containerID string, checkpointID string, checkpointDir string, exit bool) error { + return nil +} + +func (clnt *client) DeleteCheckpoint(containerID string, checkpointID string, checkpointDir string) error { + return nil +} + +func (clnt *client) ListCheckpoints(containerID string, checkpointDir string) (*Checkpoints, error) { + return nil, nil +} diff --git a/libcontainerd/client_unix.go b/libcontainerd/client_unix.go new file mode 100644 index 0000000..6dbf3af --- /dev/null +++ b/libcontainerd/client_unix.go @@ -0,0 +1,141 @@ +// +build linux solaris + +package libcontainerd + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + + "github.com/Sirupsen/logrus" + containerd "github.com/containerd/containerd/api/grpc/types" + "github.com/docker/docker/pkg/idtools" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/net/context" +) + +func (clnt *client) prepareBundleDir(uid, gid int) (string, error) { + root, err := filepath.Abs(clnt.remote.stateDir) + if err != nil { + return "", err + } + if uid == 0 && gid == 0 { + return root, nil + } + p := string(filepath.Separator) + for _, d := range strings.Split(root, string(filepath.Separator))[1:] { + p = filepath.Join(p, d) + fi, err := os.Stat(p) + if err != nil && !os.IsNotExist(err) { + return "", err + } + if os.IsNotExist(err) || fi.Mode()&1 == 0 { + p = fmt.Sprintf("%s.%d.%d", p, uid, gid) + if err := idtools.MkdirAndChown(p, 0700, idtools.IDPair{uid, gid}); err != nil && !os.IsExist(err) { + return "", err + } + } + } + return p, nil +} + +func (clnt *client) Create(containerID string, checkpoint string, checkpointDir string, spec specs.Spec, attachStdio StdioCallback, options ...CreateOption) (err error) { + clnt.lock(containerID) + defer clnt.unlock(containerID) + + if _, err := clnt.getContainer(containerID); err == nil { + return fmt.Errorf("Container %s is already active", containerID) + } + + uid, gid, err := getRootIDs(specs.Spec(spec)) + if err != nil { + return err + } + dir, err := clnt.prepareBundleDir(uid, gid) + if err != nil { + return err + } + + container := clnt.newContainer(filepath.Join(dir, containerID), options...) 
+ if err := container.clean(); err != nil { + return err + } + + defer func() { + if err != nil { + container.clean() + clnt.deleteContainer(containerID) + } + }() + + if err := idtools.MkdirAllAndChown(container.dir, 0700, idtools.IDPair{uid, gid}); err != nil && !os.IsExist(err) { + return err + } + + f, err := os.Create(filepath.Join(container.dir, configFilename)) + if err != nil { + return err + } + defer f.Close() + if err := json.NewEncoder(f).Encode(spec); err != nil { + return err + } + return container.start(&spec, checkpoint, checkpointDir, attachStdio) +} + +func (clnt *client) Signal(containerID string, sig int) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + _, err := clnt.remote.apiClient.Signal(context.Background(), &containerd.SignalRequest{ + Id: containerID, + Pid: InitFriendlyName, + Signal: uint32(sig), + }) + return err +} + +func (clnt *client) newContainer(dir string, options ...CreateOption) *container { + container := &container{ + containerCommon: containerCommon{ + process: process{ + dir: dir, + processCommon: processCommon{ + containerID: filepath.Base(dir), + client: clnt, + friendlyName: InitFriendlyName, + }, + }, + processes: make(map[string]*process), + }, + } + for _, option := range options { + if err := option.Apply(container); err != nil { + logrus.Errorf("libcontainerd: newContainer(): %v", err) + } + } + return container +} + +type exitNotifier struct { + id string + client *client + c chan struct{} + once sync.Once +} + +func (en *exitNotifier) close() { + en.once.Do(func() { + close(en.c) + en.client.mapMutex.Lock() + if en == en.client.exitNotifiers[en.id] { + delete(en.client.exitNotifiers, en.id) + } + en.client.mapMutex.Unlock() + }) +} +func (en *exitNotifier) wait() <-chan struct{} { + return en.c +} diff --git a/libcontainerd/client_windows.go b/libcontainerd/client_windows.go new file mode 100644 index 0000000..455e8e5 --- /dev/null +++ b/libcontainerd/client_windows.go @@ -0,0 +1,754 @@ +package libcontainerd + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strings" + "syscall" + "time" + + "golang.org/x/net/context" + + "github.com/Microsoft/hcsshim" + "github.com/Sirupsen/logrus" + "github.com/docker/docker/pkg/sysinfo" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +type client struct { + clientCommon + + // Platform specific properties below here (none presently on Windows) +} + +// Win32 error codes that are used for various workarounds +// These really should be ALL_CAPS to match golangs syscall library and standard +// Win32 error conventions, but golint insists on CamelCase. +const ( + CoEClassstring = syscall.Errno(0x800401F3) // Invalid class string + ErrorNoNetwork = syscall.Errno(1222) // The network is not present or not started + ErrorBadPathname = syscall.Errno(161) // The specified path is invalid + ErrorInvalidObject = syscall.Errno(0x800710D8) // The object identifier does not represent a valid object +) + +// defaultOwner is a tag passed to HCS to allow it to differentiate between +// container creator management stacks. We hard code "docker" in the case +// of docker. +const defaultOwner = "docker" + +// Create is the entrypoint to create a container from a spec, and if successfully +// created, start it too. Table below shows the fields required for HCS JSON calling parameters, +// where if not populated, is omitted. 
+// +-----------------+--------------------------------------------+---------------------------------------------------+ +// | | Isolation=Process | Isolation=Hyper-V | +// +-----------------+--------------------------------------------+---------------------------------------------------+ +// | VolumePath | \\?\\Volume{GUIDa} | | +// | LayerFolderPath | %root%\windowsfilter\containerID | %root%\windowsfilter\containerID (servicing only) | +// | Layers[] | ID=GUIDb;Path=%root%\windowsfilter\layerID | ID=GUIDb;Path=%root%\windowsfilter\layerID | +// | HvRuntime | | ImagePath=%root%\BaseLayerID\UtilityVM | +// +-----------------+--------------------------------------------+---------------------------------------------------+ +// +// Isolation=Process example: +// +// { +// "SystemType": "Container", +// "Name": "5e0055c814a6005b8e57ac59f9a522066e0af12b48b3c26a9416e23907698776", +// "Owner": "docker", +// "VolumePath": "\\\\\\\\?\\\\Volume{66d1ef4c-7a00-11e6-8948-00155ddbef9d}", +// "IgnoreFlushesDuringBoot": true, +// "LayerFolderPath": "C:\\\\control\\\\windowsfilter\\\\5e0055c814a6005b8e57ac59f9a522066e0af12b48b3c26a9416e23907698776", +// "Layers": [{ +// "ID": "18955d65-d45a-557b-bf1c-49d6dfefc526", +// "Path": "C:\\\\control\\\\windowsfilter\\\\65bf96e5760a09edf1790cb229e2dfb2dbd0fcdc0bf7451bae099106bfbfea0c" +// }], +// "HostName": "5e0055c814a6", +// "MappedDirectories": [], +// "HvPartition": false, +// "EndpointList": ["eef2649d-bb17-4d53-9937-295a8efe6f2c"], +// "Servicing": false +//} +// +// Isolation=Hyper-V example: +// +//{ +// "SystemType": "Container", +// "Name": "475c2c58933b72687a88a441e7e0ca4bd72d76413c5f9d5031fee83b98f6045d", +// "Owner": "docker", +// "IgnoreFlushesDuringBoot": true, +// "Layers": [{ +// "ID": "18955d65-d45a-557b-bf1c-49d6dfefc526", +// "Path": "C:\\\\control\\\\windowsfilter\\\\65bf96e5760a09edf1790cb229e2dfb2dbd0fcdc0bf7451bae099106bfbfea0c" +// }], +// "HostName": "475c2c58933b", +// "MappedDirectories": [], +// "HvPartition": true, +// "EndpointList": ["e1bb1e61-d56f-405e-b75d-fd520cefa0cb"], +// "DNSSearchList": "a.com,b.com,c.com", +// "HvRuntime": { +// "ImagePath": "C:\\\\control\\\\windowsfilter\\\\65bf96e5760a09edf1790cb229e2dfb2dbd0fcdc0bf7451bae099106bfbfea0c\\\\UtilityVM" +// }, +// "Servicing": false +//} +func (clnt *client) Create(containerID string, checkpoint string, checkpointDir string, spec specs.Spec, attachStdio StdioCallback, options ...CreateOption) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + if b, err := json.Marshal(spec); err == nil { + logrus.Debugln("libcontainerd: client.Create() with spec", string(b)) + } + osName := spec.Platform.OS + if osName == "windows" { + return clnt.createWindows(containerID, checkpoint, checkpointDir, spec, attachStdio, options...) + } + return clnt.createLinux(containerID, checkpoint, checkpointDir, spec, attachStdio, options...) 
+} + +func (clnt *client) createWindows(containerID string, checkpoint string, checkpointDir string, spec specs.Spec, attachStdio StdioCallback, options ...CreateOption) error { + configuration := &hcsshim.ContainerConfig{ + SystemType: "Container", + Name: containerID, + Owner: defaultOwner, + IgnoreFlushesDuringBoot: false, + HostName: spec.Hostname, + HvPartition: false, + } + + if spec.Windows.Resources != nil { + if spec.Windows.Resources.CPU != nil { + if spec.Windows.Resources.CPU.Count != nil { + // This check is being done here rather than in adaptContainerSettings + // because we don't want to update the HostConfig in case this container + // is moved to a host with more CPUs than this one. + cpuCount := *spec.Windows.Resources.CPU.Count + hostCPUCount := uint64(sysinfo.NumCPU()) + if cpuCount > hostCPUCount { + logrus.Warnf("Changing requested CPUCount of %d to current number of processors, %d", cpuCount, hostCPUCount) + cpuCount = hostCPUCount + } + configuration.ProcessorCount = uint32(cpuCount) + } + if spec.Windows.Resources.CPU.Shares != nil { + configuration.ProcessorWeight = uint64(*spec.Windows.Resources.CPU.Shares) + } + if spec.Windows.Resources.CPU.Maximum != nil { + configuration.ProcessorMaximum = int64(*spec.Windows.Resources.CPU.Maximum) + } + } + if spec.Windows.Resources.Memory != nil { + if spec.Windows.Resources.Memory.Limit != nil { + configuration.MemoryMaximumInMB = int64(*spec.Windows.Resources.Memory.Limit) / 1024 / 1024 + } + } + if spec.Windows.Resources.Storage != nil { + if spec.Windows.Resources.Storage.Bps != nil { + configuration.StorageBandwidthMaximum = *spec.Windows.Resources.Storage.Bps + } + if spec.Windows.Resources.Storage.Iops != nil { + configuration.StorageIOPSMaximum = *spec.Windows.Resources.Storage.Iops + } + } + } + + var layerOpt *LayerOption + for _, option := range options { + if s, ok := option.(*ServicingOption); ok { + configuration.Servicing = s.IsServicing + continue + } + if f, ok := option.(*FlushOption); ok { + configuration.IgnoreFlushesDuringBoot = f.IgnoreFlushesDuringBoot + continue + } + if h, ok := option.(*HyperVIsolationOption); ok { + configuration.HvPartition = h.IsHyperV + continue + } + if l, ok := option.(*LayerOption); ok { + layerOpt = l + } + if n, ok := option.(*NetworkEndpointsOption); ok { + configuration.EndpointList = n.Endpoints + configuration.AllowUnqualifiedDNSQuery = n.AllowUnqualifiedDNSQuery + if n.DNSSearchList != nil { + configuration.DNSSearchList = strings.Join(n.DNSSearchList, ",") + } + configuration.NetworkSharedContainerName = n.NetworkSharedContainerID + continue + } + if c, ok := option.(*CredentialsOption); ok { + configuration.Credentials = c.Credentials + continue + } + } + + // We must have a layer option with at least one path + if layerOpt == nil || layerOpt.LayerPaths == nil { + return fmt.Errorf("no layer option or paths were supplied to the runtime") + } + + if configuration.HvPartition { + // Find the upper-most utility VM image, since the utility VM does not + // use layering in RS1. + // TODO @swernli/jhowardmsft at some point post RS1 this may be re-locatable. 
+ var uvmImagePath string + for _, path := range layerOpt.LayerPaths { + fullPath := filepath.Join(path, "UtilityVM") + _, err := os.Stat(fullPath) + if err == nil { + uvmImagePath = fullPath + break + } + if !os.IsNotExist(err) { + return err + } + } + if uvmImagePath == "" { + return errors.New("utility VM image could not be found") + } + configuration.HvRuntime = &hcsshim.HvRuntime{ImagePath: uvmImagePath} + } else { + configuration.VolumePath = spec.Root.Path + } + + configuration.LayerFolderPath = layerOpt.LayerFolderPath + + for _, layerPath := range layerOpt.LayerPaths { + _, filename := filepath.Split(layerPath) + g, err := hcsshim.NameToGuid(filename) + if err != nil { + return err + } + configuration.Layers = append(configuration.Layers, hcsshim.Layer{ + ID: g.ToString(), + Path: layerPath, + }) + } + + // Add the mounts (volumes, bind mounts etc) to the structure + mds := make([]hcsshim.MappedDir, len(spec.Mounts)) + for i, mount := range spec.Mounts { + mds[i] = hcsshim.MappedDir{ + HostPath: mount.Source, + ContainerPath: mount.Destination, + ReadOnly: false, + } + for _, o := range mount.Options { + if strings.ToLower(o) == "ro" { + mds[i].ReadOnly = true + } + } + } + configuration.MappedDirectories = mds + + hcsContainer, err := hcsshim.CreateContainer(containerID, configuration) + if err != nil { + return err + } + + // Construct a container object for calling start on it. + container := &container{ + containerCommon: containerCommon{ + process: process{ + processCommon: processCommon{ + containerID: containerID, + client: clnt, + friendlyName: InitFriendlyName, + }, + }, + processes: make(map[string]*process), + }, + ociSpec: spec, + hcsContainer: hcsContainer, + } + + container.options = options + for _, option := range options { + if err := option.Apply(container); err != nil { + logrus.Errorf("libcontainerd: %v", err) + } + } + + // Call start, and if it fails, delete the container from our + // internal structure, start will keep HCS in sync by deleting the + // container there. + logrus.Debugf("libcontainerd: createWindows() id=%s, Calling start()", containerID) + if err := container.start(attachStdio); err != nil { + clnt.deleteContainer(containerID) + return err + } + + logrus.Debugf("libcontainerd: createWindows() id=%s completed successfully", containerID) + return nil + +} + +func (clnt *client) createLinux(containerID string, checkpoint string, checkpointDir string, spec specs.Spec, attachStdio StdioCallback, options ...CreateOption) error { + logrus.Debugf("libcontainerd: createLinux(): containerId %s ", containerID) + + // TODO @jhowardmsft LCOW Support: This needs to be configurable, not hard-coded. + // However, good-enough for the LCOW bring-up. 
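Both createWindows above and createLinux (continuing below) are configured through CreateOption values rather than positional arguments. A hedged sketch of how a caller might assemble them for the Hyper-V path; every path and ID here is illustrative:

	opts := []CreateOption{
		&HyperVIsolationOption{IsHyperV: true},
		&FlushOption{IgnoreFlushesDuringBoot: true},
		&LayerOption{
			LayerFolderPath: `C:\control\windowsfilter\` + containerID,
			LayerPaths:      []string{`C:\control\windowsfilter\<layerID>`},
		},
		&NetworkEndpointsOption{
			Endpoints:     []string{endpointID},
			DNSSearchList: []string{"a.com", "b.com"},
		},
	}
	if err := clnt.Create(containerID, "", "", spec, attachStdio, opts...); err != nil {
		return err
	}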
+ configuration := &hcsshim.ContainerConfig{ + HvPartition: true, + Name: containerID, + SystemType: "container", + ContainerType: "linux", + Owner: defaultOwner, + TerminateOnLastHandleClosed: true, + HvRuntime: &hcsshim.HvRuntime{ + ImagePath: `c:\Program Files\Linux Containers`, + LinuxKernelFile: `bootx64.efi`, + LinuxInitrdFile: `initrd.img`, + }, + } + + var layerOpt *LayerOption + for _, option := range options { + if l, ok := option.(*LayerOption); ok { + layerOpt = l + } + } + + // We must have a layer option with at least one path + if layerOpt == nil || layerOpt.LayerPaths == nil { + return fmt.Errorf("no layer option or paths were supplied to the runtime") + } + + // LayerFolderPath (writeable layer) + Layers (Guid + path) + configuration.LayerFolderPath = layerOpt.LayerFolderPath + for _, layerPath := range layerOpt.LayerPaths { + _, filename := filepath.Split(layerPath) + g, err := hcsshim.NameToGuid(filename) + if err != nil { + return err + } + configuration.Layers = append(configuration.Layers, hcsshim.Layer{ + ID: g.ToString(), + Path: filepath.Join(layerPath, "layer.vhd"), + }) + } + + for _, option := range options { + if n, ok := option.(*NetworkEndpointsOption); ok { + configuration.EndpointList = n.Endpoints + configuration.AllowUnqualifiedDNSQuery = n.AllowUnqualifiedDNSQuery + if n.DNSSearchList != nil { + configuration.DNSSearchList = strings.Join(n.DNSSearchList, ",") + } + configuration.NetworkSharedContainerName = n.NetworkSharedContainerID + break + } + } + + hcsContainer, err := hcsshim.CreateContainer(containerID, configuration) + if err != nil { + return err + } + + // Construct a container object for calling start on it. + container := &container{ + containerCommon: containerCommon{ + process: process{ + processCommon: processCommon{ + containerID: containerID, + client: clnt, + friendlyName: InitFriendlyName, + }, + }, + processes: make(map[string]*process), + }, + ociSpec: spec, + hcsContainer: hcsContainer, + } + + container.options = options + for _, option := range options { + if err := option.Apply(container); err != nil { + logrus.Errorf("libcontainerd: createLinux() %v", err) + } + } + + // Call start, and if it fails, delete the container from our + // internal structure, start will keep HCS in sync by deleting the + // container there. + logrus.Debugf("libcontainerd: createLinux() id=%s, Calling start()", containerID) + if err := container.start(attachStdio); err != nil { + clnt.deleteContainer(containerID) + return err + } + + logrus.Debugf("libcontainerd: createLinux() id=%s completed successfully", containerID) + return nil +} + +// AddProcess is the handler for adding a process to an already running +// container. It's called through docker exec. It returns the system pid of the +// exec'd process. +func (clnt *client) AddProcess(ctx context.Context, containerID, processFriendlyName string, procToAdd Process, attachStdio StdioCallback) (int, error) { + clnt.lock(containerID) + defer clnt.unlock(containerID) + container, err := clnt.getContainer(containerID) + if err != nil { + return -1, err + } + // Note we always tell HCS to + // create stdout as it's required regardless of '-i' or '-t' options, so that + // docker can always grab the output through logs. We also tell HCS to always + // create stdin, even if it's not used - it will be closed shortly. Stderr + // is only created if it we're not -t. 
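	// Note on the block below: hcsshim's ProcessConfig.ConsoleSize is
	// indexed {0: height, 1: width}, the reverse of the usual
	// width-by-height order, hence the explicit per-index assignments.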
+ createProcessParms := hcsshim.ProcessConfig{ + CreateStdInPipe: true, + CreateStdOutPipe: true, + CreateStdErrPipe: !procToAdd.Terminal, + } + if procToAdd.Terminal { + createProcessParms.EmulateConsole = true + createProcessParms.ConsoleSize[0] = uint(procToAdd.ConsoleSize.Height) + createProcessParms.ConsoleSize[1] = uint(procToAdd.ConsoleSize.Width) + } + + // Take working directory from the process to add if it is defined, + // otherwise take from the first process. + if procToAdd.Cwd != "" { + createProcessParms.WorkingDirectory = procToAdd.Cwd + } else { + createProcessParms.WorkingDirectory = container.ociSpec.Process.Cwd + } + + // Configure the environment for the process + createProcessParms.Environment = setupEnvironmentVariables(procToAdd.Env) + if container.ociSpec.Platform.OS == "windows" { + createProcessParms.CommandLine = strings.Join(procToAdd.Args, " ") + } else { + createProcessParms.CommandArgs = procToAdd.Args + } + createProcessParms.User = procToAdd.User.Username + + logrus.Debugf("libcontainerd: commandLine: %s", createProcessParms.CommandLine) + + // Start the command running in the container. + var stdout, stderr io.ReadCloser + var stdin io.WriteCloser + newProcess, err := container.hcsContainer.CreateProcess(&createProcessParms) + if err != nil { + logrus.Errorf("libcontainerd: AddProcess(%s) CreateProcess() failed %s", containerID, err) + return -1, err + } + + pid := newProcess.Pid() + + stdin, stdout, stderr, err = newProcess.Stdio() + if err != nil { + logrus.Errorf("libcontainerd: %s getting std pipes failed %s", containerID, err) + return -1, err + } + + iopipe := &IOPipe{Terminal: procToAdd.Terminal} + iopipe.Stdin = createStdInCloser(stdin, newProcess) + + // Convert io.ReadClosers to io.Readers + if stdout != nil { + iopipe.Stdout = ioutil.NopCloser(&autoClosingReader{ReadCloser: stdout}) + } + if stderr != nil { + iopipe.Stderr = ioutil.NopCloser(&autoClosingReader{ReadCloser: stderr}) + } + + proc := &process{ + processCommon: processCommon{ + containerID: containerID, + friendlyName: processFriendlyName, + client: clnt, + systemPid: uint32(pid), + }, + hcsProcess: newProcess, + } + + // Add the process to the container's list of processes + container.processes[processFriendlyName] = proc + + // Tell the engine to attach streams back to the client + if err := attachStdio(*iopipe); err != nil { + return -1, err + } + + // Spin up a go routine waiting for exit to handle cleanup + go container.waitExit(proc, false) + + return pid, nil +} + +// Signal handles `docker stop` on Windows. While Linux has support for +// the full range of signals, signals aren't really implemented on Windows. +// We fake supporting regular stop and -9 to force kill. +func (clnt *client) Signal(containerID string, sig int) error { + var ( + cont *container + err error + ) + + // Get the container as we need it to get the container handle. 
+ clnt.lock(containerID) + defer clnt.unlock(containerID) + if cont, err = clnt.getContainer(containerID); err != nil { + return err + } + + cont.manualStopRequested = true + + logrus.Debugf("libcontainerd: Signal() containerID=%s sig=%d pid=%d", containerID, sig, cont.systemPid) + + if syscall.Signal(sig) == syscall.SIGKILL { + // Terminate the compute system + if err := cont.hcsContainer.Terminate(); err != nil { + if !hcsshim.IsPending(err) { + logrus.Errorf("libcontainerd: failed to terminate %s - %q", containerID, err) + } + } + } else { + // Shut down the container + if err := cont.hcsContainer.Shutdown(); err != nil { + if !hcsshim.IsPending(err) && !hcsshim.IsAlreadyStopped(err) { + // ignore errors + logrus.Warnf("libcontainerd: failed to shutdown container %s: %q", containerID, err) + } + } + } + + return nil +} + +// While Linux has support for the full range of signals, signals aren't really implemented on Windows. +// We try to terminate the specified process whatever signal is requested. +func (clnt *client) SignalProcess(containerID string, processFriendlyName string, sig int) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + cont, err := clnt.getContainer(containerID) + if err != nil { + return err + } + + for _, p := range cont.processes { + if p.friendlyName == processFriendlyName { + return p.hcsProcess.Kill() + } + } + + return fmt.Errorf("SignalProcess could not find process %s in %s", processFriendlyName, containerID) +} + +// Resize handles a CLI event to resize an interactive docker run or docker exec +// window. +func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error { + // Get the libcontainerd container object + clnt.lock(containerID) + defer clnt.unlock(containerID) + cont, err := clnt.getContainer(containerID) + if err != nil { + return err + } + + h, w := uint16(height), uint16(width) + + if processFriendlyName == InitFriendlyName { + logrus.Debugln("libcontainerd: resizing systemPID in", containerID, cont.process.systemPid) + return cont.process.hcsProcess.ResizeConsole(w, h) + } + + for _, p := range cont.processes { + if p.friendlyName == processFriendlyName { + logrus.Debugln("libcontainerd: resizing exec'd process", containerID, p.systemPid) + return p.hcsProcess.ResizeConsole(w, h) + } + } + + return fmt.Errorf("Resize could not find containerID %s to resize", containerID) + +} + +// Pause handles pause requests for containers +func (clnt *client) Pause(containerID string) error { + unlockContainer := true + // Get the libcontainerd container object + clnt.lock(containerID) + defer func() { + if unlockContainer { + clnt.unlock(containerID) + } + }() + container, err := clnt.getContainer(containerID) + if err != nil { + return err + } + + for _, option := range container.options { + if h, ok := option.(*HyperVIsolationOption); ok { + if !h.IsHyperV { + return errors.New("cannot pause Windows Server Containers") + } + break + } + } + + err = container.hcsContainer.Pause() + if err != nil { + return err + } + + // Unlock container before calling back into the daemon + unlockContainer = false + clnt.unlock(containerID) + + return clnt.backend.StateChanged(containerID, StateInfo{ + CommonStateInfo: CommonStateInfo{ + State: StatePause, + }}) +} + +// Resume handles resume requests for containers +func (clnt *client) Resume(containerID string) error { + unlockContainer := true + // Get the libcontainerd container object + clnt.lock(containerID) + defer func() { + if unlockContainer { + 
clnt.unlock(containerID) + } + }() + container, err := clnt.getContainer(containerID) + if err != nil { + return err + } + + // This should never happen, since Windows Server Containers cannot be paused + for _, option := range container.options { + if h, ok := option.(*HyperVIsolationOption); ok { + if !h.IsHyperV { + return errors.New("cannot resume Windows Server Containers") + } + break + } + } + + err = container.hcsContainer.Resume() + if err != nil { + return err + } + + // Unlock container before calling back into the daemon + unlockContainer = false + clnt.unlock(containerID) + + return clnt.backend.StateChanged(containerID, StateInfo{ + CommonStateInfo: CommonStateInfo{ + State: StateResume, + }}) +} + +// Stats handles stats requests for containers +func (clnt *client) Stats(containerID string) (*Stats, error) { + // Get the libcontainerd container object + clnt.lock(containerID) + defer clnt.unlock(containerID) + container, err := clnt.getContainer(containerID) + if err != nil { + return nil, err + } + s, err := container.hcsContainer.Statistics() + if err != nil { + return nil, err + } + st := Stats(s) + return &st, nil +} + +// Restore is the handler for restoring a container +func (clnt *client) Restore(containerID string, _ StdioCallback, unusedOnWindows ...CreateOption) error { + logrus.Debugf("libcontainerd: Restore(%s)", containerID) + + // TODO Windows: On RS1, a re-attach isn't possible. + // However, there is a scenario in which there is an issue. + // Consider a background container. The daemon dies unexpectedly. + // HCS will still have the compute service alive and running. + // For consistence, we call in to shoot it regardless if HCS knows about it + // We explicitly just log a warning if the terminate fails. + // Then we tell the backend the container exited. + if hc, err := hcsshim.OpenContainer(containerID); err == nil { + const terminateTimeout = time.Minute * 2 + err := hc.Terminate() + + if hcsshim.IsPending(err) { + err = hc.WaitTimeout(terminateTimeout) + } else if hcsshim.IsAlreadyStopped(err) { + err = nil + } + + if err != nil { + logrus.Warnf("libcontainerd: failed to terminate %s on restore - %q", containerID, err) + return err + } + } + return clnt.backend.StateChanged(containerID, StateInfo{ + CommonStateInfo: CommonStateInfo{ + State: StateExit, + ExitCode: 1 << 31, + }}) +} + +// GetPidsForContainer returns a list of process IDs running in a container. +// Not used on Windows. +func (clnt *client) GetPidsForContainer(containerID string) ([]int, error) { + return nil, errors.New("not implemented on Windows") +} + +// Summary returns a summary of the processes running in a container. +// This is present in Windows to support docker top. In linux, the +// engine shells out to ps to get process information. On Windows, as +// the containers could be Hyper-V containers, they would not be +// visible on the container host. However, libcontainerd does have +// that information. +func (clnt *client) Summary(containerID string) ([]Summary, error) { + + // Get the libcontainerd container object + clnt.lock(containerID) + defer clnt.unlock(containerID) + container, err := clnt.getContainer(containerID) + if err != nil { + return nil, err + } + p, err := container.hcsContainer.ProcessList() + if err != nil { + return nil, err + } + pl := make([]Summary, len(p)) + for i := range p { + pl[i] = Summary(p[i]) + } + return pl, nil +} + +// UpdateResources updates resources for a running container. 
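Summary above is what backs docker top on Windows, where Hyper-V container processes are invisible to the host. A hedged consumption sketch; ProcessId and ImageName follow hcsshim's process list entries, so treat the field names as assumptions:

	summaries, err := clnt.Summary(containerID)
	if err != nil {
		return err
	}
	for _, s := range summaries {
		// Summary is a thin alias over hcsshim's process list item.
		fmt.Printf("%d\t%s\n", s.ProcessId, s.ImageName)
	}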
+func (clnt *client) UpdateResources(containerID string, resources Resources) error { + // Updating resource isn't supported on Windows + // but we should return nil for enabling updating container + return nil +} + +func (clnt *client) CreateCheckpoint(containerID string, checkpointID string, checkpointDir string, exit bool) error { + return errors.New("Windows: Containers do not support checkpoints") +} + +func (clnt *client) DeleteCheckpoint(containerID string, checkpointID string, checkpointDir string) error { + return errors.New("Windows: Containers do not support checkpoints") +} + +func (clnt *client) ListCheckpoints(containerID string, checkpointDir string) (*Checkpoints, error) { + return nil, errors.New("Windows: Containers do not support checkpoints") +} + +func (clnt *client) GetServerVersion(ctx context.Context) (*ServerVersion, error) { + return &ServerVersion{}, nil +} diff --git a/libcontainerd/container.go b/libcontainerd/container.go new file mode 100644 index 0000000..b403213 --- /dev/null +++ b/libcontainerd/container.go @@ -0,0 +1,13 @@ +package libcontainerd + +const ( + // InitFriendlyName is the name given in the lookup map of processes + // for the first process started in a container. + InitFriendlyName = "init" + configFilename = "config.json" +) + +type containerCommon struct { + process + processes map[string]*process +} diff --git a/libcontainerd/container_unix.go b/libcontainerd/container_unix.go new file mode 100644 index 0000000..be16999 --- /dev/null +++ b/libcontainerd/container_unix.go @@ -0,0 +1,246 @@ +// +build linux solaris + +package libcontainerd + +import ( + "encoding/json" + "io" + "io/ioutil" + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/Sirupsen/logrus" + containerd "github.com/containerd/containerd/api/grpc/types" + "github.com/docker/docker/pkg/ioutils" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/tonistiigi/fifo" + "golang.org/x/net/context" +) + +type container struct { + containerCommon + + // Platform specific fields are below here. + pauseMonitor + oom bool + runtime string + runtimeArgs []string +} + +type runtime struct { + path string + args []string +} + +// WithRuntime sets the runtime to be used for the created container +func WithRuntime(path string, args []string) CreateOption { + return runtime{path, args} +} + +func (rt runtime) Apply(p interface{}) error { + if pr, ok := p.(*container); ok { + pr.runtime = rt.path + pr.runtimeArgs = rt.args + } + return nil +} + +func (ctr *container) clean() error { + if os.Getenv("LIBCONTAINERD_NOCLEAN") == "1" { + return nil + } + if _, err := os.Lstat(ctr.dir); err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + + if err := os.RemoveAll(ctr.dir); err != nil { + return err + } + return nil +} + +// cleanProcess removes the fifos used by an additional process. +// Caller needs to lock container ID before calling this method. 
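WithRuntime above is the hook for selecting an alternative OCI runtime per container (the daemon's --add-runtime path feeds it). A hedged usage sketch; the binary path and flag are illustrative:

	opts := []CreateOption{
		WithRuntime("/usr/local/sbin/my-runc", []string{"--debug"}),
	}
	if err := clnt.Create(containerID, "", "", spec, attachStdio, opts...); err != nil {
		return err
	}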
+func (ctr *container) cleanProcess(id string) { + if p, ok := ctr.processes[id]; ok { + for _, i := range []int{syscall.Stdin, syscall.Stdout, syscall.Stderr} { + if err := os.Remove(p.fifo(i)); err != nil && !os.IsNotExist(err) { + logrus.Warnf("libcontainerd: failed to remove %v for process %v: %v", p.fifo(i), id, err) + } + } + } + delete(ctr.processes, id) +} + +func (ctr *container) spec() (*specs.Spec, error) { + var spec specs.Spec + dt, err := ioutil.ReadFile(filepath.Join(ctr.dir, configFilename)) + if err != nil { + return nil, err + } + if err := json.Unmarshal(dt, &spec); err != nil { + return nil, err + } + return &spec, nil +} + +func (ctr *container) start(spec *specs.Spec, checkpoint, checkpointDir string, attachStdio StdioCallback) (err error) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ready := make(chan struct{}) + + fifoCtx, cancel := context.WithCancel(context.Background()) + defer func() { + if err != nil { + cancel() + } + }() + + iopipe, err := ctr.openFifos(fifoCtx, spec.Process.Terminal) + if err != nil { + return err + } + + var stdinOnce sync.Once + + // we need to delay stdin closure after container start or else "stdin close" + // event will be rejected by containerd. + // stdin closure happens in attachStdio + stdin := iopipe.Stdin + iopipe.Stdin = ioutils.NewWriteCloserWrapper(stdin, func() error { + var err error + stdinOnce.Do(func() { // on error from attach we don't know if stdin was already closed + err = stdin.Close() + go func() { + select { + case <-ready: + case <-ctx.Done(): + } + select { + case <-ready: + if err := ctr.sendCloseStdin(); err != nil { + logrus.Warnf("failed to close stdin: %+v", err) + } + default: + } + }() + }) + return err + }) + + r := &containerd.CreateContainerRequest{ + Id: ctr.containerID, + BundlePath: ctr.dir, + Stdin: ctr.fifo(syscall.Stdin), + Stdout: ctr.fifo(syscall.Stdout), + Stderr: ctr.fifo(syscall.Stderr), + Checkpoint: checkpoint, + CheckpointDir: checkpointDir, + // check to see if we are running in ramdisk to disable pivot root + NoPivotRoot: os.Getenv("DOCKER_RAMDISK") != "", + Runtime: ctr.runtime, + RuntimeArgs: ctr.runtimeArgs, + } + ctr.client.appendContainer(ctr) + + if err := attachStdio(*iopipe); err != nil { + ctr.closeFifos(iopipe) + return err + } + + resp, err := ctr.client.remote.apiClient.CreateContainer(context.Background(), r) + if err != nil { + ctr.closeFifos(iopipe) + return err + } + ctr.systemPid = systemPid(resp.Container) + close(ready) + + return ctr.client.backend.StateChanged(ctr.containerID, StateInfo{ + CommonStateInfo: CommonStateInfo{ + State: StateStart, + Pid: ctr.systemPid, + }}) + +} + +func (ctr *container) newProcess(friendlyName string) *process { + return &process{ + dir: ctr.dir, + processCommon: processCommon{ + containerID: ctr.containerID, + friendlyName: friendlyName, + client: ctr.client, + }, + } +} + +func (ctr *container) handleEvent(e *containerd.Event) error { + ctr.client.lock(ctr.containerID) + defer ctr.client.unlock(ctr.containerID) + switch e.Type { + case StateExit, StatePause, StateResume, StateOOM: + st := StateInfo{ + CommonStateInfo: CommonStateInfo{ + State: e.Type, + ExitCode: e.Status, + }, + OOMKilled: e.Type == StateExit && ctr.oom, + } + if e.Type == StateOOM { + ctr.oom = true + } + if e.Type == StateExit && e.Pid != InitFriendlyName { + st.ProcessID = e.Pid + st.State = StateExitProcess + } + + // Remove process from list if we have exited + switch st.State { + case StateExit: + ctr.clean() + 
ctr.client.deleteContainer(e.Id) + case StateExitProcess: + ctr.cleanProcess(st.ProcessID) + } + ctr.client.q.append(e.Id, func() { + if err := ctr.client.backend.StateChanged(e.Id, st); err != nil { + logrus.Errorf("libcontainerd: backend.StateChanged(): %v", err) + } + if e.Type == StatePause || e.Type == StateResume { + ctr.pauseMonitor.handle(e.Type) + } + if e.Type == StateExit { + if en := ctr.client.getExitNotifier(e.Id); en != nil { + en.close() + } + } + }) + + default: + logrus.Debugf("libcontainerd: event unhandled: %+v", e) + } + return nil +} + +// discardFifos attempts to fully read the container fifos to unblock processes +// that may be blocked on the writer side. +func (ctr *container) discardFifos() { + ctx, _ := context.WithTimeout(context.Background(), 3*time.Second) + for _, i := range []int{syscall.Stdout, syscall.Stderr} { + f, err := fifo.OpenFifo(ctx, ctr.fifo(i), syscall.O_RDONLY|syscall.O_NONBLOCK, 0) + if err != nil { + logrus.Warnf("error opening fifo %v for discarding: %+v", f, err) + continue + } + go func() { + io.Copy(ioutil.Discard, f) + }() + } +} diff --git a/libcontainerd/container_windows.go b/libcontainerd/container_windows.go new file mode 100644 index 0000000..af3e0ef --- /dev/null +++ b/libcontainerd/container_windows.go @@ -0,0 +1,330 @@ +package libcontainerd + +import ( + "encoding/json" + "fmt" + "io" + "io/ioutil" + "strings" + "syscall" + "time" + + "github.com/Microsoft/hcsshim" + "github.com/Sirupsen/logrus" + "github.com/docker/docker/pkg/system" + "github.com/opencontainers/runtime-spec/specs-go" +) + +type container struct { + containerCommon + + // Platform specific fields are below here. There are none presently on Windows. + options []CreateOption + + // The ociSpec is required, as client.Create() needs a spec, + // but can be called from the RestartManager context which does not + // otherwise have access to the Spec + ociSpec specs.Spec + + manualStopRequested bool + hcsContainer hcsshim.Container +} + +func (ctr *container) newProcess(friendlyName string) *process { + return &process{ + processCommon: processCommon{ + containerID: ctr.containerID, + friendlyName: friendlyName, + client: ctr.client, + }, + } +} + +// start starts a created container. +// Caller needs to lock container ID before calling this method. +func (ctr *container) start(attachStdio StdioCallback) error { + var err error + isServicing := false + + for _, option := range ctr.options { + if s, ok := option.(*ServicingOption); ok && s.IsServicing { + isServicing = true + } + } + + // Start the container. If this is a servicing container, this call will block + // until the container is done with the servicing execution. + logrus.Debugln("libcontainerd: starting container ", ctr.containerID) + if err = ctr.hcsContainer.Start(); err != nil { + logrus.Errorf("libcontainerd: failed to start container: %s", err) + if err := ctr.terminate(); err != nil { + logrus.Errorf("libcontainerd: failed to cleanup after a failed Start. %s", err) + } else { + logrus.Debugln("libcontainerd: cleaned up after failed Start by calling Terminate") + } + return err + } + + // Note we always tell HCS to + // create stdout as it's required regardless of '-i' or '-t' options, so that + // docker can always grab the output through logs. We also tell HCS to always + // create stdin, even if it's not used - it will be closed shortly. Stderr + // is only created if it we're not -t. 
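handleEvent above pushes backend notifications through ctr.client.q.append(e.Id, ...) so events for one container are delivered in order without stalling the gRPC event stream. The queue type itself lives in queue.go, outside this hunk; a minimal sketch of that kind of keyed serial queue, offered as an assumption about its shape:

	type serialQueue struct {
		mu   sync.Mutex
		last map[string]chan struct{}
	}

	// append runs f after every previously appended fn for the same id.
	func (q *serialQueue) append(id string, f func()) {
		q.mu.Lock()
		defer q.mu.Unlock()
		if q.last == nil {
			q.last = make(map[string]chan struct{})
		}
		prev := q.last[id]
		done := make(chan struct{})
		q.last[id] = done
		go func() {
			if prev != nil {
				<-prev // preserve per-id ordering
			}
			f()
			close(done)
		}()
	}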
+	createProcessParms := &hcsshim.ProcessConfig{
+		EmulateConsole:   ctr.ociSpec.Process.Terminal,
+		WorkingDirectory: ctr.ociSpec.Process.Cwd,
+		CreateStdInPipe:  !isServicing,
+		CreateStdOutPipe: !isServicing,
+		CreateStdErrPipe: !ctr.ociSpec.Process.Terminal && !isServicing,
+	}
+	createProcessParms.ConsoleSize[0] = uint(ctr.ociSpec.Process.ConsoleSize.Height)
+	createProcessParms.ConsoleSize[1] = uint(ctr.ociSpec.Process.ConsoleSize.Width)
+
+	// Configure the environment for the process
+	createProcessParms.Environment = setupEnvironmentVariables(ctr.ociSpec.Process.Env)
+	if ctr.ociSpec.Platform.OS == "windows" {
+		createProcessParms.CommandLine = strings.Join(ctr.ociSpec.Process.Args, " ")
+	} else {
+		createProcessParms.CommandArgs = ctr.ociSpec.Process.Args
+	}
+	createProcessParms.User = ctr.ociSpec.Process.User.Username
+
+	// LCOW requires the raw OCI spec passed through HCS and onwards to GCS for the utility VM.
+	if system.LCOWSupported() && ctr.ociSpec.Platform.OS == "linux" {
+		ociBuf, err := json.Marshal(ctr.ociSpec)
+		if err != nil {
+			return err
+		}
+		ociRaw := json.RawMessage(ociBuf)
+		createProcessParms.OCISpecification = &ociRaw
+	}
+
+	// Start the command running in the container.
+	newProcess, err := ctr.hcsContainer.CreateProcess(createProcessParms)
+	if err != nil {
+		logrus.Errorf("libcontainerd: CreateProcess() failed %s", err)
+		if err := ctr.terminate(); err != nil {
+			logrus.Errorf("libcontainerd: failed to cleanup after a failed CreateProcess. %s", err)
+		} else {
+			logrus.Debugln("libcontainerd: cleaned up after failed CreateProcess by calling Terminate")
+		}
+		return err
+	}
+
+	pid := newProcess.Pid()
+
+	// Save the hcs Process and PID
+	ctr.process.friendlyName = InitFriendlyName
+	ctr.process.hcsProcess = newProcess
+
+	// If this is a servicing container, wait on the process synchronously here and,
+	// if it succeeds, wait for it to shut down cleanly and merge into the parent container.
+	if isServicing {
+		exitCode := ctr.waitProcessExitCode(&ctr.process)
+
+		if exitCode != 0 {
+			if err := ctr.terminate(); err != nil {
+				logrus.Warnf("libcontainerd: terminating servicing container %s failed: %s", ctr.containerID, err)
+			}
+			return fmt.Errorf("libcontainerd: servicing container %s returned non-zero exit code %d", ctr.containerID, exitCode)
+		}
+
+		return ctr.hcsContainer.WaitTimeout(time.Minute * 5)
+	}
+
+	var stdout, stderr io.ReadCloser
+	var stdin io.WriteCloser
+	stdin, stdout, stderr, err = newProcess.Stdio()
+	if err != nil {
+		logrus.Errorf("libcontainerd: failed to get stdio pipes: %s", err)
+		if err := ctr.terminate(); err != nil {
+			logrus.Errorf("libcontainerd: failed to cleanup after a failed Stdio. %s", err)
+		}
+		return err
+	}
+
+	iopipe := &IOPipe{Terminal: ctr.ociSpec.Process.Terminal}
+
+	iopipe.Stdin = createStdInCloser(stdin, newProcess)
+
+	// Convert io.ReadClosers to io.Readers
+	if stdout != nil {
+		iopipe.Stdout = ioutil.NopCloser(&autoClosingReader{ReadCloser: stdout})
+	}
+	if stderr != nil {
+		iopipe.Stderr = ioutil.NopCloser(&autoClosingReader{ReadCloser: stderr})
+	}
+
+	// Save the PID
+	logrus.Debugf("libcontainerd: process started - PID %d", pid)
+	ctr.systemPid = uint32(pid)
+
+	// Spin up a goroutine waiting for exit to handle cleanup
+	go ctr.waitExit(&ctr.process, true)
+
+	ctr.client.appendContainer(ctr)
+
+	if err := attachStdio(*iopipe); err != nil {
+		// OK to return the error here, as waitExit will handle tear-down in HCS
+		return err
+	}
+
+	// Tell the docker engine that the container has started.
+	si := StateInfo{
+		CommonStateInfo: CommonStateInfo{
+			State: StateStart,
+			Pid:   ctr.systemPid, // Not sure this is needed? Double-check monitor.go in daemon BUGBUG @jhowardmsft
+		}}
+	logrus.Debugf("libcontainerd: start() completed OK, %+v", si)
+	return ctr.client.backend.StateChanged(ctr.containerID, si)
+
+}
+
+// waitProcessExitCode will wait for the given process to exit and return its exit code.
+func (ctr *container) waitProcessExitCode(process *process) int {
+	// Block indefinitely for the process to exit.
+	err := process.hcsProcess.Wait()
+	if err != nil {
+		if herr, ok := err.(*hcsshim.ProcessError); ok && herr.Err != syscall.ERROR_BROKEN_PIPE {
+			logrus.Warnf("libcontainerd: Wait() failed (container may have been killed): %s", err)
+		}
+		// Fall through here, do not return. This ensures we attempt to continue the
+		// shutdown in HCS and tell the docker engine that the process/container
+		// has exited to avoid a container being dropped on the floor.
+	}
+
+	exitCode, err := process.hcsProcess.ExitCode()
+	if err != nil {
+		if herr, ok := err.(*hcsshim.ProcessError); ok && herr.Err != syscall.ERROR_BROKEN_PIPE {
+			logrus.Warnf("libcontainerd: unable to get exit code from container %s", ctr.containerID)
+		}
+		// Since we got an error retrieving the exit code, make sure that the code we return
+		// doesn't incorrectly indicate success.
+		exitCode = -1
+
+		// Fall through here, do not return. This ensures we attempt to continue the
+		// shutdown in HCS and tell the docker engine that the process/container
+		// has exited to avoid a container being dropped on the floor.
+	}
+
+	return exitCode
+}
+
+// waitExit runs as a goroutine waiting for the process to exit. It is the
+// Windows counterpart of the Linux flow, where containerd delivers state
+// change notifications over its event stream.
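
Since Windows has no containerd event stream here, exit reporting reduces to a goroutine parked on Wait plus a single notification. A minimal sketch of that pattern, with an invented `exitWaiter` interface standing in for the hcsshim process handle (none of these names come from the source):

    package main

    import (
    	"fmt"
    	"time"
    )

    // exitWaiter stands in for the hcsshim process handle: Wait blocks until
    // the process exits, ExitCode reports the result afterwards.
    type exitWaiter interface {
    	Wait() error
    	ExitCode() (int, error)
    }

    // reportExit mirrors the waitExit flow: park a goroutine on Wait, collect
    // the exit code (never reporting success if it cannot be read), then
    // notify the caller exactly once.
    func reportExit(p exitWaiter, notify func(exitCode int)) {
    	go func() {
    		if err := p.Wait(); err != nil {
    			// Fall through, as the real code does: still try to fetch an
    			// exit code so the container is not dropped on the floor.
    			fmt.Println("wait failed:", err)
    		}
    		code, err := p.ExitCode()
    		if err != nil {
    			code = -1 // do not report success when the code is unknown
    		}
    		notify(code)
    	}()
    }

    type fakeProc struct{ d time.Duration }

    func (f fakeProc) Wait() error            { time.Sleep(f.d); return nil }
    func (f fakeProc) ExitCode() (int, error) { return 0, nil }

    func main() {
    	done := make(chan struct{})
    	reportExit(fakeProc{d: 10 * time.Millisecond}, func(code int) {
    		fmt.Println("exit code:", code)
    		close(done)
    	})
    	<-done
    }
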
+func (ctr *container) waitExit(process *process, isFirstProcessToStart bool) error { + logrus.Debugln("libcontainerd: waitExit() on pid", process.systemPid) + + exitCode := ctr.waitProcessExitCode(process) + // Lock the container while removing the process/container from the list + ctr.client.lock(ctr.containerID) + + if !isFirstProcessToStart { + ctr.cleanProcess(process.friendlyName) + } else { + ctr.client.deleteContainer(ctr.containerID) + } + + // Unlock here so other threads are unblocked + ctr.client.unlock(ctr.containerID) + + // Assume the container has exited + si := StateInfo{ + CommonStateInfo: CommonStateInfo{ + State: StateExit, + ExitCode: uint32(exitCode), + Pid: process.systemPid, + ProcessID: process.friendlyName, + }, + UpdatePending: false, + } + + // But it could have been an exec'd process which exited + if !isFirstProcessToStart { + si.State = StateExitProcess + } else { + // Pending updates is only applicable for WCOW + if ctr.ociSpec.Platform.OS == "windows" { + updatePending, err := ctr.hcsContainer.HasPendingUpdates() + if err != nil { + logrus.Warnf("libcontainerd: HasPendingUpdates() failed (container may have been killed): %s", err) + } else { + si.UpdatePending = updatePending + } + } + + logrus.Debugf("libcontainerd: shutting down container %s", ctr.containerID) + if err := ctr.shutdown(); err != nil { + logrus.Debugf("libcontainerd: failed to shutdown container %s", ctr.containerID) + } else { + logrus.Debugf("libcontainerd: completed shutting down container %s", ctr.containerID) + } + if err := ctr.hcsContainer.Close(); err != nil { + logrus.Error(err) + } + } + + if err := process.hcsProcess.Close(); err != nil { + logrus.Errorf("libcontainerd: hcsProcess.Close(): %v", err) + } + + // Call into the backend to notify it of the state change. + logrus.Debugf("libcontainerd: waitExit() calling backend.StateChanged %+v", si) + if err := ctr.client.backend.StateChanged(ctr.containerID, si); err != nil { + logrus.Error(err) + } + + logrus.Debugf("libcontainerd: waitExit() completed OK, %+v", si) + + return nil +} + +// cleanProcess removes process from the map. +// Caller needs to lock container ID before calling this method. +func (ctr *container) cleanProcess(id string) { + delete(ctr.processes, id) +} + +// shutdown shuts down the container in HCS +// Caller needs to lock container ID before calling this method. +func (ctr *container) shutdown() error { + const shutdownTimeout = time.Minute * 5 + err := ctr.hcsContainer.Shutdown() + if hcsshim.IsPending(err) { + // Explicit timeout to avoid a (remote) possibility that shutdown hangs indefinitely. + err = ctr.hcsContainer.WaitTimeout(shutdownTimeout) + } else if hcsshim.IsAlreadyStopped(err) { + err = nil + } + + if err != nil { + logrus.Debugf("libcontainerd: error shutting down container %s %v calling terminate", ctr.containerID, err) + if err := ctr.terminate(); err != nil { + return err + } + return err + } + + return nil +} + +// terminate terminates the container in HCS +// Caller needs to lock container ID before calling this method. 
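
The shutdown/terminate pair encodes a common HCS idiom: issue the polite request, bound a "pending" answer with a timeout, and only then escalate to force. A stdlib-only sketch of the same ladder; `computeSystem` and `errPending` are invented for illustration and do not match hcsshim's actual API:

    package main

    import (
    	"errors"
    	"fmt"
    	"time"
    )

    // errPending plays the role of hcsshim's "operation pending" answer.
    var errPending = errors.New("operation pending")

    // computeSystem is an invented stand-in for the container handle.
    type computeSystem interface {
    	Shutdown() error
    	Terminate() error
    	WaitTimeout(time.Duration) error
    }

    // stop mirrors the escalation above: request a clean shutdown, bound a
    // pending operation with a timeout, and only then terminate.
    func stop(cs computeSystem, timeout time.Duration) error {
    	err := cs.Shutdown()
    	if err == errPending {
    		err = cs.WaitTimeout(timeout)
    	}
    	if err == nil {
    		return nil
    	}
    	fmt.Println("clean shutdown failed, terminating:", err)
    	return cs.Terminate()
    }

    // slowSystem reports shutdown as pending and then completes in time.
    type slowSystem struct{}

    func (slowSystem) Shutdown() error                 { return errPending }
    func (slowSystem) Terminate() error                { return nil }
    func (slowSystem) WaitTimeout(time.Duration) error { return nil }

    func main() {
    	fmt.Println(stop(slowSystem{}, 5*time.Minute)) // <nil>
    }
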
+func (ctr *container) terminate() error { + const terminateTimeout = time.Minute * 5 + err := ctr.hcsContainer.Terminate() + + if hcsshim.IsPending(err) { + err = ctr.hcsContainer.WaitTimeout(terminateTimeout) + } else if hcsshim.IsAlreadyStopped(err) { + err = nil + } + + if err != nil { + logrus.Debugf("libcontainerd: error terminating container %s %v", ctr.containerID, err) + return err + } + + return nil +} diff --git a/libcontainerd/oom_linux.go b/libcontainerd/oom_linux.go new file mode 100644 index 0000000..e126b7a --- /dev/null +++ b/libcontainerd/oom_linux.go @@ -0,0 +1,31 @@ +package libcontainerd + +import ( + "fmt" + "os" + "strconv" + + "github.com/Sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer/system" +) + +func setOOMScore(pid, score int) error { + oomScoreAdjPath := fmt.Sprintf("/proc/%d/oom_score_adj", pid) + f, err := os.OpenFile(oomScoreAdjPath, os.O_WRONLY, 0) + if err != nil { + return err + } + stringScore := strconv.Itoa(score) + _, err = f.WriteString(stringScore) + f.Close() + if os.IsPermission(err) { + // Setting oom_score_adj does not work in an + // unprivileged container. Ignore the error, but log + // it if we appear not to be in that situation. + if !system.RunningInUserNS() { + logrus.Debugf("Permission denied writing %q to %s", stringScore, oomScoreAdjPath) + } + return nil + } + return err +} diff --git a/libcontainerd/oom_solaris.go b/libcontainerd/oom_solaris.go new file mode 100644 index 0000000..2ebe5e8 --- /dev/null +++ b/libcontainerd/oom_solaris.go @@ -0,0 +1,5 @@ +package libcontainerd + +func setOOMScore(pid, score int) error { + return nil +} diff --git a/libcontainerd/pausemonitor_unix.go b/libcontainerd/pausemonitor_unix.go new file mode 100644 index 0000000..4f3766d --- /dev/null +++ b/libcontainerd/pausemonitor_unix.go @@ -0,0 +1,42 @@ +// +build !windows + +package libcontainerd + +import ( + "sync" +) + +// pauseMonitor is helper to get notifications from pause state changes. +type pauseMonitor struct { + sync.Mutex + waiters map[string][]chan struct{} +} + +func (m *pauseMonitor) handle(t string) { + m.Lock() + defer m.Unlock() + if m.waiters == nil { + return + } + q, ok := m.waiters[t] + if !ok { + return + } + if len(q) > 0 { + close(q[0]) + m.waiters[t] = q[1:] + } +} + +func (m *pauseMonitor) append(t string, waiter chan struct{}) { + m.Lock() + defer m.Unlock() + if m.waiters == nil { + m.waiters = make(map[string][]chan struct{}) + } + _, ok := m.waiters[t] + if !ok { + m.waiters[t] = make([]chan struct{}, 0) + } + m.waiters[t] = append(m.waiters[t], waiter) +} diff --git a/libcontainerd/process.go b/libcontainerd/process.go new file mode 100644 index 0000000..57562c8 --- /dev/null +++ b/libcontainerd/process.go @@ -0,0 +1,18 @@ +package libcontainerd + +// processCommon are the platform common fields as part of the process structure +// which keeps the state for the main container process, as well as any exec +// processes. 
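
Before moving on to processCommon below, the pauseMonitor type above deserves a quick illustration: callers register one channel per state name, and each incoming event releases exactly one waiter in FIFO order. A hedged usage sketch, with the type re-declared so the snippet stands alone (the literal "pause" stands in for the StatePause constant defined elsewhere in the package):

    package main

    import (
    	"fmt"
    	"sync"
    )

    // pauseMonitor is re-declared here, matching the shape of the one in
    // pausemonitor_unix.go.
    type pauseMonitor struct {
    	sync.Mutex
    	waiters map[string][]chan struct{}
    }

    func (m *pauseMonitor) handle(t string) {
    	m.Lock()
    	defer m.Unlock()
    	if q := m.waiters[t]; len(q) > 0 {
    		close(q[0]) // release exactly one waiter, FIFO order
    		m.waiters[t] = q[1:]
    	}
    }

    func (m *pauseMonitor) append(t string, waiter chan struct{}) {
    	m.Lock()
    	defer m.Unlock()
    	if m.waiters == nil {
    		m.waiters = make(map[string][]chan struct{})
    	}
    	m.waiters[t] = append(m.waiters[t], waiter)
    }

    func main() {
    	var m pauseMonitor
    	done := make(chan struct{})
    	m.append("pause", done)
    	go m.handle("pause") // e.g. driven by a pause event from containerd
    	<-done
    	fmt.Println("pause acknowledged")
    }
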
+type processCommon struct { + client *client + + // containerID is the Container ID + containerID string + + // friendlyName is an identifier for the process (or `InitFriendlyName` + // for the first process) + friendlyName string + + // systemPid is the PID of the main container process + systemPid uint32 +} diff --git a/libcontainerd/process_unix.go b/libcontainerd/process_unix.go new file mode 100644 index 0000000..3b54e32 --- /dev/null +++ b/libcontainerd/process_unix.go @@ -0,0 +1,107 @@ +// +build linux solaris + +package libcontainerd + +import ( + "io" + "io/ioutil" + "os" + "path/filepath" + goruntime "runtime" + "strings" + + containerd "github.com/containerd/containerd/api/grpc/types" + "github.com/tonistiigi/fifo" + "golang.org/x/net/context" + "golang.org/x/sys/unix" +) + +var fdNames = map[int]string{ + unix.Stdin: "stdin", + unix.Stdout: "stdout", + unix.Stderr: "stderr", +} + +// process keeps the state for both main container process and exec process. +type process struct { + processCommon + + // Platform specific fields are below here. + dir string +} + +func (p *process) openFifos(ctx context.Context, terminal bool) (pipe *IOPipe, err error) { + if err := os.MkdirAll(p.dir, 0700); err != nil { + return nil, err + } + + io := &IOPipe{} + + io.Stdin, err = fifo.OpenFifo(ctx, p.fifo(unix.Stdin), unix.O_WRONLY|unix.O_CREAT|unix.O_NONBLOCK, 0700) + if err != nil { + return nil, err + } + + defer func() { + if err != nil { + io.Stdin.Close() + } + }() + + io.Stdout, err = fifo.OpenFifo(ctx, p.fifo(unix.Stdout), unix.O_RDONLY|unix.O_CREAT|unix.O_NONBLOCK, 0700) + if err != nil { + return nil, err + } + + defer func() { + if err != nil { + io.Stdout.Close() + } + }() + + if goruntime.GOOS == "solaris" || !terminal { + // For Solaris terminal handling is done exclusively by the runtime therefore we make no distinction + // in the processing for terminal and !terminal cases. + io.Stderr, err = fifo.OpenFifo(ctx, p.fifo(unix.Stderr), unix.O_RDONLY|unix.O_CREAT|unix.O_NONBLOCK, 0700) + if err != nil { + return nil, err + } + defer func() { + if err != nil { + io.Stderr.Close() + } + }() + } else { + io.Stderr = ioutil.NopCloser(emptyReader{}) + } + + return io, nil +} + +func (p *process) sendCloseStdin() error { + _, err := p.client.remote.apiClient.UpdateProcess(context.Background(), &containerd.UpdateProcessRequest{ + Id: p.containerID, + Pid: p.friendlyName, + CloseStdin: true, + }) + if err != nil && (strings.Contains(err.Error(), "container not found") || strings.Contains(err.Error(), "process not found")) { + return nil + } + return err +} + +func (p *process) closeFifos(io *IOPipe) { + io.Stdin.Close() + io.Stdout.Close() + io.Stderr.Close() +} + +type emptyReader struct{} + +func (r emptyReader) Read(b []byte) (int, error) { + return 0, io.EOF +} + +func (p *process) fifo(index int) string { + return filepath.Join(p.dir, p.friendlyName+"-"+fdNames[index]) +} diff --git a/libcontainerd/process_windows.go b/libcontainerd/process_windows.go new file mode 100644 index 0000000..854c4dd --- /dev/null +++ b/libcontainerd/process_windows.go @@ -0,0 +1,48 @@ +package libcontainerd + +import ( + "io" + "sync" + + "github.com/Microsoft/hcsshim" + "github.com/docker/docker/pkg/ioutils" +) + +// process keeps the state for both main container process and exec process. +type process struct { + processCommon + + // Platform specific fields are below here. 
+ hcsProcess hcsshim.Process +} + +type autoClosingReader struct { + io.ReadCloser + sync.Once +} + +func (r *autoClosingReader) Read(b []byte) (n int, err error) { + n, err = r.ReadCloser.Read(b) + if err == io.EOF { + r.Once.Do(func() { r.ReadCloser.Close() }) + } + return +} + +func createStdInCloser(pipe io.WriteCloser, process hcsshim.Process) io.WriteCloser { + return ioutils.NewWriteCloserWrapper(pipe, func() error { + if err := pipe.Close(); err != nil { + return err + } + + err := process.CloseStdin() + if err != nil && !hcsshim.IsNotExist(err) && !hcsshim.IsAlreadyClosed(err) { + // This error will occur if the compute system is currently shutting down + if perr, ok := err.(*hcsshim.ProcessError); ok && perr.Err != hcsshim.ErrVmcomputeOperationInvalidState { + return err + } + } + + return nil + }) +} diff --git a/libcontainerd/queue_unix.go b/libcontainerd/queue_unix.go new file mode 100644 index 0000000..66765f7 --- /dev/null +++ b/libcontainerd/queue_unix.go @@ -0,0 +1,37 @@ +// +build linux solaris + +package libcontainerd + +import "sync" + +type queue struct { + sync.Mutex + fns map[string]chan struct{} +} + +func (q *queue) append(id string, f func()) { + q.Lock() + defer q.Unlock() + + if q.fns == nil { + q.fns = make(map[string]chan struct{}) + } + + done := make(chan struct{}) + + fn, ok := q.fns[id] + q.fns[id] = done + go func() { + if ok { + <-fn + } + f() + close(done) + + q.Lock() + if q.fns[id] == done { + delete(q.fns, id) + } + q.Unlock() + }() +} diff --git a/libcontainerd/queue_unix_test.go b/libcontainerd/queue_unix_test.go new file mode 100644 index 0000000..bb49a5d --- /dev/null +++ b/libcontainerd/queue_unix_test.go @@ -0,0 +1,33 @@ +// +build linux solaris + +package libcontainerd + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestSerialization(t *testing.T) { + var ( + q queue + serialization = 1 + ) + + q.append("aaa", func() { + //simulate a long time task + time.Sleep(10 * time.Millisecond) + require.EqualValues(t, serialization, 1) + serialization = 2 + }) + q.append("aaa", func() { + require.EqualValues(t, serialization, 2) + serialization = 3 + }) + q.append("aaa", func() { + require.EqualValues(t, serialization, 3) + serialization = 4 + }) + time.Sleep(20 * time.Millisecond) +} diff --git a/libcontainerd/remote.go b/libcontainerd/remote.go new file mode 100644 index 0000000..9031e3a --- /dev/null +++ b/libcontainerd/remote.go @@ -0,0 +1,20 @@ +package libcontainerd + +// Remote on Linux defines the accesspoint to the containerd grpc API. +// Remote on Windows is largely an unimplemented interface as there is +// no remote containerd. +type Remote interface { + // Client returns a new Client instance connected with given Backend. + Client(Backend) (Client, error) + // Cleanup stops containerd if it was started by libcontainerd. + // Note this is not used on Windows as there is no remote containerd. + Cleanup() + // UpdateOptions allows various remote options to be updated at runtime. + UpdateOptions(...RemoteOption) error +} + +// RemoteOption allows to configure parameters of remotes. +// This is unused on Windows. 
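
The interface below follows Go's functional-options pattern: each knob is a small named type whose Apply mutates the concrete remote hiding behind the interface, so New(stateDir, options...) keeps a stable signature as settings are added. A self-contained sketch of the pattern with invented names (`server`, `withAddr`, `withDebug` are not part of this package):

    package main

    import "fmt"

    type server struct {
    	addr  string
    	debug bool
    }

    // Option mirrors the RemoteOption shape: one Apply method that mutates
    // the concrete type behind the interface.
    type Option interface {
    	Apply(*server) error
    }

    type withAddr string

    func (a withAddr) Apply(s *server) error { s.addr = string(a); return nil }

    type withDebug bool

    func (d withDebug) Apply(s *server) error { s.debug = bool(d); return nil }

    // newServer applies options in order, as New() does with RemoteOption values.
    func newServer(opts ...Option) (*server, error) {
    	s := &server{addr: "localhost:0"} // defaults first, options override
    	for _, o := range opts {
    		if err := o.Apply(s); err != nil {
    			return nil, err
    		}
    	}
    	return s, nil
    }

    func main() {
    	s, _ := newServer(withAddr("127.0.0.1:9000"), withDebug(true))
    	fmt.Printf("%+v\n", *s)
    }
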
+type RemoteOption interface { + Apply(Remote) error +} diff --git a/libcontainerd/remote_unix.go b/libcontainerd/remote_unix.go new file mode 100644 index 0000000..a81a93c --- /dev/null +++ b/libcontainerd/remote_unix.go @@ -0,0 +1,565 @@ +// +build linux solaris + +package libcontainerd + +import ( + "fmt" + "io" + "io/ioutil" + "log" + "net" + "os" + "os/exec" + "path/filepath" + goruntime "runtime" + "strconv" + "strings" + "sync" + "syscall" + "time" + + "github.com/Sirupsen/logrus" + containerd "github.com/containerd/containerd/api/grpc/types" + "github.com/docker/docker/pkg/locker" + "github.com/docker/docker/pkg/system" + "github.com/golang/protobuf/ptypes" + "github.com/golang/protobuf/ptypes/timestamp" + "golang.org/x/net/context" + "google.golang.org/grpc" + "google.golang.org/grpc/grpclog" + "google.golang.org/grpc/health/grpc_health_v1" + "google.golang.org/grpc/transport" +) + +const ( + maxConnectionRetryCount = 3 + containerdHealthCheckTimeout = 3 * time.Second + containerdShutdownTimeout = 15 * time.Second + containerdBinary = "docker-containerd" + containerdPidFilename = "docker-containerd.pid" + containerdSockFilename = "docker-containerd.sock" + containerdStateDir = "containerd" + eventTimestampFilename = "event.ts" +) + +type remote struct { + sync.RWMutex + apiClient containerd.APIClient + daemonPid int + stateDir string + rpcAddr string + startDaemon bool + closedManually bool + debugLog bool + rpcConn *grpc.ClientConn + clients []*client + eventTsPath string + runtime string + runtimeArgs []string + daemonWaitCh chan struct{} + liveRestore bool + oomScore int + restoreFromTimestamp *timestamp.Timestamp +} + +// New creates a fresh instance of libcontainerd remote. +func New(stateDir string, options ...RemoteOption) (_ Remote, err error) { + defer func() { + if err != nil { + err = fmt.Errorf("Failed to connect to containerd. Please make sure containerd is installed in your PATH or you have specified the correct address. Got error: %v", err) + } + }() + r := &remote{ + stateDir: stateDir, + daemonPid: -1, + eventTsPath: filepath.Join(stateDir, eventTimestampFilename), + } + for _, option := range options { + if err := option.Apply(r); err != nil { + return nil, err + } + } + + if err := system.MkdirAll(stateDir, 0700, ""); err != nil { + return nil, err + } + + if r.rpcAddr == "" { + r.rpcAddr = filepath.Join(stateDir, containerdSockFilename) + } + + if r.startDaemon { + if err := r.runContainerdDaemon(); err != nil { + return nil, err + } + } + + // don't output the grpc reconnect logging + grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags)) + dialOpts := []grpc.DialOption{ + grpc.WithInsecure(), + grpc.WithBackoffMaxDelay(2 * time.Second), + grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("unix", addr, timeout) + }), + } + conn, err := grpc.Dial(r.rpcAddr, dialOpts...) 
+ if err != nil { + return nil, fmt.Errorf("error connecting to containerd: %v", err) + } + + r.rpcConn = conn + r.apiClient = containerd.NewAPIClient(conn) + + // Get the timestamp to restore from + t := r.getLastEventTimestamp() + tsp, err := ptypes.TimestampProto(t) + if err != nil { + logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err) + } + r.restoreFromTimestamp = tsp + + go r.handleConnectionChange() + + if err := r.startEventsMonitor(); err != nil { + return nil, err + } + + return r, nil +} + +func (r *remote) UpdateOptions(options ...RemoteOption) error { + for _, option := range options { + if err := option.Apply(r); err != nil { + return err + } + } + return nil +} + +func (r *remote) handleConnectionChange() { + var transientFailureCount = 0 + + ticker := time.NewTicker(500 * time.Millisecond) + defer ticker.Stop() + healthClient := grpc_health_v1.NewHealthClient(r.rpcConn) + + for { + <-ticker.C + ctx, cancel := context.WithTimeout(context.Background(), containerdHealthCheckTimeout) + _, err := healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{}) + cancel() + if err == nil { + continue + } + + logrus.Debugf("libcontainerd: containerd health check returned error: %v", err) + + if r.daemonPid != -1 { + if r.closedManually { + // Well, we asked for it to stop, just return + return + } + // all other errors are transient + // Reset state to be notified of next failure + transientFailureCount++ + if transientFailureCount >= maxConnectionRetryCount { + transientFailureCount = 0 + if system.IsProcessAlive(r.daemonPid) { + system.KillProcess(r.daemonPid) + } + <-r.daemonWaitCh + if err := r.runContainerdDaemon(); err != nil { //FIXME: Handle error + logrus.Errorf("libcontainerd: error restarting containerd: %v", err) + } + continue + } + } + } +} + +func (r *remote) Cleanup() { + if r.daemonPid == -1 { + return + } + r.closedManually = true + r.rpcConn.Close() + // Ask the daemon to quit + syscall.Kill(r.daemonPid, syscall.SIGTERM) + + // Wait up to 15secs for it to stop + for i := time.Duration(0); i < containerdShutdownTimeout; i += time.Second { + if !system.IsProcessAlive(r.daemonPid) { + break + } + time.Sleep(time.Second) + } + + if system.IsProcessAlive(r.daemonPid) { + logrus.Warnf("libcontainerd: containerd (%d) didn't stop within 15 secs, killing it\n", r.daemonPid) + syscall.Kill(r.daemonPid, syscall.SIGKILL) + } + + // cleanup some files + os.Remove(filepath.Join(r.stateDir, containerdPidFilename)) + os.Remove(filepath.Join(r.stateDir, containerdSockFilename)) +} + +func (r *remote) Client(b Backend) (Client, error) { + c := &client{ + clientCommon: clientCommon{ + backend: b, + containers: make(map[string]*container), + locker: locker.New(), + }, + remote: r, + exitNotifiers: make(map[string]*exitNotifier), + liveRestore: r.liveRestore, + } + + r.Lock() + r.clients = append(r.clients, c) + r.Unlock() + return c, nil +} + +func (r *remote) updateEventTimestamp(t time.Time) { + f, err := os.OpenFile(r.eventTsPath, syscall.O_CREAT|syscall.O_WRONLY|syscall.O_TRUNC, 0600) + if err != nil { + logrus.Warnf("libcontainerd: failed to open event timestamp file: %v", err) + return + } + defer f.Close() + + b, err := t.MarshalText() + if err != nil { + logrus.Warnf("libcontainerd: failed to encode timestamp: %v", err) + return + } + + n, err := f.Write(b) + if err != nil || n != len(b) { + logrus.Warnf("libcontainerd: failed to update event timestamp file: %v", err) + f.Truncate(0) + return + } +} + +func (r *remote) getLastEventTimestamp() time.Time { + t 
:= time.Now()
+
+	// Any stat failure (including os.IsNotExist) means there is nothing
+	// usable to restore from; checking err also guards the fi.Size() call.
+	fi, err := os.Stat(r.eventTsPath)
+	if err != nil || fi.Size() == 0 {
+		return t
+	}
+
+	f, err := os.Open(r.eventTsPath)
+	if err != nil {
+		logrus.Warnf("libcontainerd: Unable to access last event ts: %v", err)
+		return t
+	}
+	defer f.Close()
+
+	b := make([]byte, fi.Size())
+	n, err := f.Read(b)
+	if err != nil || n != len(b) {
+		logrus.Warnf("libcontainerd: Unable to read last event ts: %v", err)
+		return t
+	}
+
+	if err := t.UnmarshalText(b); err != nil {
+		logrus.Warnf("libcontainerd: Unable to parse last event ts: %v", err)
+	}
+
+	return t
+}
+
+func (r *remote) startEventsMonitor() error {
+	// First, get past events
+	t := r.getLastEventTimestamp()
+	tsp, err := ptypes.TimestampProto(t)
+	if err != nil {
+		logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err)
+	}
+	er := &containerd.EventsRequest{
+		Timestamp: tsp,
+	}
+
+	var events containerd.API_EventsClient
+	for {
+		events, err = r.apiClient.Events(context.Background(), er, grpc.FailFast(false))
+		if err == nil {
+			break
+		}
+		logrus.Warnf("libcontainerd: failed to get events from containerd: %q", err)
+
+		if r.closedManually {
+			// ignore error if grpc remote connection is closed manually
+			return nil
+		}
+
+		<-time.After(100 * time.Millisecond)
+	}
+
+	go r.handleEventStream(events)
+	return nil
+}
+
+func (r *remote) handleEventStream(events containerd.API_EventsClient) {
+	for {
+		e, err := events.Recv()
+		if err != nil {
+			if grpc.ErrorDesc(err) == transport.ErrConnClosing.Desc &&
+				r.closedManually {
+				// ignore error if grpc remote connection is closed manually
+				return
+			}
+			logrus.Errorf("libcontainerd: failed to receive event from containerd: %v", err)
+			go r.startEventsMonitor()
+			return
+		}
+
+		logrus.Debugf("libcontainerd: received containerd event: %#v", e)
+
+		var container *container
+		var c *client
+		r.RLock()
+		for _, c = range r.clients {
+			container, err = c.getContainer(e.Id)
+			if err == nil {
+				break
+			}
+		}
+		r.RUnlock()
+		if container == nil {
+			logrus.Warnf("libcontainerd: unknown container %s", e.Id)
+			continue
+		}
+
+		if err := container.handleEvent(e); err != nil {
+			logrus.Errorf("libcontainerd: error processing state change for %s: %v", e.Id, err)
+		}
+
+		tsp, err := ptypes.Timestamp(e.Timestamp)
+		if err != nil {
+			logrus.Errorf("libcontainerd: failed to convert event timestamp: %q", err)
+			continue
+		}
+
+		r.updateEventTimestamp(tsp)
+	}
+}
+
+func (r *remote) runContainerdDaemon() error {
+	pidFilename := filepath.Join(r.stateDir, containerdPidFilename)
+	f, err := os.OpenFile(pidFilename, os.O_RDWR|os.O_CREATE, 0600)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	// The file may already exist; check whether the daemon it records is still alive
+	b := make([]byte, 8)
+	n, err := f.Read(b)
+	if err != nil && err != io.EOF {
+		return err
+	}
+
+	if n > 0 {
+		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
+		if err != nil {
+			return err
+		}
+		if system.IsProcessAlive(int(pid)) {
+			logrus.Infof("libcontainerd: previous instance of containerd still alive (%d)", pid)
+			r.daemonPid = int(pid)
+			return nil
+		}
+	}
+
+	// rewind the file
+	_, err = f.Seek(0, os.SEEK_SET)
+	if err != nil {
+		return err
+	}
+
+	// Truncate it
+	err = f.Truncate(0)
+	if err != nil {
+		return err
+	}
+
+	// Start a new instance
+	args := []string{
+		"-l", fmt.Sprintf("unix://%s", r.rpcAddr),
+		"--metrics-interval=0",
+		"--start-timeout", "2m",
+		"--state-dir", filepath.Join(r.stateDir, containerdStateDir),
+	}
+	if goruntime.GOOS == "solaris" {
+		args = append(args, "--shim", "containerd-shim", "--runtime", "runc")
+	} else {
+		args = append(args, "--shim", "docker-containerd-shim")
+		if r.runtime != 
"" { + args = append(args, "--runtime") + args = append(args, r.runtime) + } + } + if r.debugLog { + args = append(args, "--debug") + } + if len(r.runtimeArgs) > 0 { + for _, v := range r.runtimeArgs { + args = append(args, "--runtime-args") + args = append(args, v) + } + logrus.Debugf("libcontainerd: runContainerdDaemon: runtimeArgs: %s", args) + } + + cmd := exec.Command(containerdBinary, args...) + // redirect containerd logs to docker logs + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.SysProcAttr = setSysProcAttr(true) + cmd.Env = nil + // clear the NOTIFY_SOCKET from the env when starting containerd + for _, e := range os.Environ() { + if !strings.HasPrefix(e, "NOTIFY_SOCKET") { + cmd.Env = append(cmd.Env, e) + } + } + if err := cmd.Start(); err != nil { + return err + } + + // unless strictly necessary, do not add anything in between here + // as the reaper goroutine below needs to kick in as soon as possible + // and any "return" from code paths added here will defeat the reaper + // process. + + r.daemonWaitCh = make(chan struct{}) + go func() { + cmd.Wait() + close(r.daemonWaitCh) + }() // Reap our child when needed + + logrus.Infof("libcontainerd: new containerd process, pid: %d", cmd.Process.Pid) + if err := setOOMScore(cmd.Process.Pid, r.oomScore); err != nil { + system.KillProcess(cmd.Process.Pid) + return err + } + if _, err := f.WriteString(fmt.Sprintf("%d", cmd.Process.Pid)); err != nil { + system.KillProcess(cmd.Process.Pid) + return err + } + + r.daemonPid = cmd.Process.Pid + return nil +} + +// WithRemoteAddr sets the external containerd socket to connect to. +func WithRemoteAddr(addr string) RemoteOption { + return rpcAddr(addr) +} + +type rpcAddr string + +func (a rpcAddr) Apply(r Remote) error { + if remote, ok := r.(*remote); ok { + remote.rpcAddr = string(a) + return nil + } + return fmt.Errorf("WithRemoteAddr option not supported for this remote") +} + +// WithRuntimePath sets the path of the runtime to be used as the +// default by containerd +func WithRuntimePath(rt string) RemoteOption { + return runtimePath(rt) +} + +type runtimePath string + +func (rt runtimePath) Apply(r Remote) error { + if remote, ok := r.(*remote); ok { + remote.runtime = string(rt) + return nil + } + return fmt.Errorf("WithRuntime option not supported for this remote") +} + +// WithRuntimeArgs sets the list of runtime args passed to containerd +func WithRuntimeArgs(args []string) RemoteOption { + return runtimeArgs(args) +} + +type runtimeArgs []string + +func (rt runtimeArgs) Apply(r Remote) error { + if remote, ok := r.(*remote); ok { + remote.runtimeArgs = rt + return nil + } + return fmt.Errorf("WithRuntimeArgs option not supported for this remote") +} + +// WithStartDaemon defines if libcontainerd should also run containerd daemon. +func WithStartDaemon(start bool) RemoteOption { + return startDaemon(start) +} + +type startDaemon bool + +func (s startDaemon) Apply(r Remote) error { + if remote, ok := r.(*remote); ok { + remote.startDaemon = bool(s) + return nil + } + return fmt.Errorf("WithStartDaemon option not supported for this remote") +} + +// WithDebugLog defines if containerd debug logs will be enabled for daemon. 
+func WithDebugLog(debug bool) RemoteOption { + return debugLog(debug) +} + +type debugLog bool + +func (d debugLog) Apply(r Remote) error { + if remote, ok := r.(*remote); ok { + remote.debugLog = bool(d) + return nil + } + return fmt.Errorf("WithDebugLog option not supported for this remote") +} + +// WithLiveRestore defines if containers are stopped on shutdown or restored. +func WithLiveRestore(v bool) RemoteOption { + return liveRestore(v) +} + +type liveRestore bool + +func (l liveRestore) Apply(r Remote) error { + if remote, ok := r.(*remote); ok { + remote.liveRestore = bool(l) + for _, c := range remote.clients { + c.liveRestore = bool(l) + } + return nil + } + return fmt.Errorf("WithLiveRestore option not supported for this remote") +} + +// WithOOMScore defines the oom_score_adj to set for the containerd process. +func WithOOMScore(score int) RemoteOption { + return oomScore(score) +} + +type oomScore int + +func (o oomScore) Apply(r Remote) error { + if remote, ok := r.(*remote); ok { + remote.oomScore = int(o) + return nil + } + return fmt.Errorf("WithOOMScore option not supported for this remote") +} diff --git a/libcontainerd/remote_windows.go b/libcontainerd/remote_windows.go new file mode 100644 index 0000000..74c1044 --- /dev/null +++ b/libcontainerd/remote_windows.go @@ -0,0 +1,36 @@ +package libcontainerd + +import "github.com/docker/docker/pkg/locker" + +type remote struct { +} + +func (r *remote) Client(b Backend) (Client, error) { + c := &client{ + clientCommon: clientCommon{ + backend: b, + containers: make(map[string]*container), + locker: locker.New(), + }, + } + return c, nil +} + +// Cleanup is a no-op on Windows. It is here to implement the interface. +func (r *remote) Cleanup() { +} + +func (r *remote) UpdateOptions(opts ...RemoteOption) error { + return nil +} + +// New creates a fresh instance of libcontainerd remote. On Windows, +// this is not used as there is no remote containerd process. +func New(_ string, _ ...RemoteOption) (Remote, error) { + return &remote{}, nil +} + +// WithLiveRestore is a noop on windows. +func WithLiveRestore(v bool) RemoteOption { + return nil +} diff --git a/libcontainerd/types.go b/libcontainerd/types.go new file mode 100644 index 0000000..c7ade6b --- /dev/null +++ b/libcontainerd/types.go @@ -0,0 +1,75 @@ +package libcontainerd + +import ( + "io" + + containerd "github.com/containerd/containerd/api/grpc/types" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/net/context" +) + +// State constants used in state change reporting. +const ( + StateStart = "start-container" + StatePause = "pause" + StateResume = "resume" + StateExit = "exit" + StateRestore = "restore" + StateExitProcess = "exit-process" + StateOOM = "oom" // fake state +) + +// CommonStateInfo contains the state info common to all platforms. +type CommonStateInfo struct { // FIXME: event? + State string + Pid uint32 + ExitCode uint32 + ProcessID string +} + +// Backend defines callbacks that the client of the library needs to implement. +type Backend interface { + StateChanged(containerID string, state StateInfo) error +} + +// Client provides access to containerd features. 
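
Backend above is the half that the daemon implements, and the Client interface below is the half it consumes; every state transition funnels through the single StateChanged callback. A minimal recording Backend of the kind a test might use, with the two types trimmed to stand-ins so the sketch compiles on its own:

    package main

    import (
    	"fmt"
    	"sync"
    )

    // StateInfo and Backend are trimmed stand-ins for the package types.
    type StateInfo struct {
    	State    string
    	ExitCode uint32
    }

    type Backend interface {
    	StateChanged(containerID string, state StateInfo) error
    }

    // recordingBackend remembers every transition per container.
    type recordingBackend struct {
    	mu     sync.Mutex
    	states map[string][]StateInfo
    }

    func (b *recordingBackend) StateChanged(id string, st StateInfo) error {
    	b.mu.Lock()
    	defer b.mu.Unlock()
    	if b.states == nil {
    		b.states = make(map[string][]StateInfo)
    	}
    	b.states[id] = append(b.states[id], st)
    	return nil
    }

    func main() {
    	var b recordingBackend
    	var _ Backend = &b // the daemon would hand this to remote.Client(...)
    	b.StateChanged("abc", StateInfo{State: "start-container"})
    	b.StateChanged("abc", StateInfo{State: "exit", ExitCode: 0})
    	fmt.Println(b.states["abc"])
    }
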
+type Client interface {
+	GetServerVersion(ctx context.Context) (*ServerVersion, error)
+	Create(containerID string, checkpoint string, checkpointDir string, spec specs.Spec, attachStdio StdioCallback, options ...CreateOption) error
+	Signal(containerID string, sig int) error
+	SignalProcess(containerID string, processFriendlyName string, sig int) error
+	AddProcess(ctx context.Context, containerID, processFriendlyName string, process Process, attachStdio StdioCallback) (int, error)
+	Resize(containerID, processFriendlyName string, width, height int) error
+	Pause(containerID string) error
+	Resume(containerID string) error
+	Restore(containerID string, attachStdio StdioCallback, options ...CreateOption) error
+	Stats(containerID string) (*Stats, error)
+	GetPidsForContainer(containerID string) ([]int, error)
+	Summary(containerID string) ([]Summary, error)
+	UpdateResources(containerID string, resources Resources) error
+	CreateCheckpoint(containerID string, checkpointID string, checkpointDir string, exit bool) error
+	DeleteCheckpoint(containerID string, checkpointID string, checkpointDir string) error
+	ListCheckpoints(containerID string, checkpointDir string) (*Checkpoints, error)
+}
+
+// CreateOption allows configuring parameters of container creation.
+type CreateOption interface {
+	Apply(interface{}) error
+}
+
+// StdioCallback is called to connect a container or process stdio.
+type StdioCallback func(IOPipe) error
+
+// IOPipe contains the stdio streams.
+type IOPipe struct {
+	Stdin    io.WriteCloser
+	Stdout   io.ReadCloser
+	Stderr   io.ReadCloser
+	Terminal bool // Whether stderr is connected on Windows
+}
+
+// ServerVersion contains version information as retrieved from the
+// server.
+type ServerVersion struct {
+	containerd.GetServerVersionResponse
+}
diff --git a/libcontainerd/types_linux.go b/libcontainerd/types_linux.go
new file mode 100644
index 0000000..4f06358
--- /dev/null
+++ b/libcontainerd/types_linux.go
@@ -0,0 +1,49 @@
+package libcontainerd
+
+import (
+	containerd "github.com/containerd/containerd/api/grpc/types"
+	"github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// Process contains information to start a specific application inside the container.
+type Process struct {
+	// Terminal creates an interactive terminal for the container.
+	Terminal bool `json:"terminal"`
+	// User specifies user information for the process.
+	User *specs.User `json:"user"`
+	// Args specifies the binary and arguments for the application to execute.
+	Args []string `json:"args"`
+	// Env populates the process environment for the process.
+	Env []string `json:"env,omitempty"`
+	// Cwd is the current working directory for the process and must be
+	// relative to the container's root.
+	Cwd *string `json:"cwd"`
+	// Capabilities are linux capabilities that are kept for the container.
+	Capabilities []string `json:"capabilities,omitempty"`
+	// Rlimits specifies rlimit options to apply to the process.
+	Rlimits []specs.LinuxRlimit `json:"rlimits,omitempty"`
+	// ApparmorProfile specifies the apparmor profile for the container.
+	ApparmorProfile *string `json:"apparmorProfile,omitempty"`
+	// SelinuxLabel specifies the selinux context that the container process is run as.
+	SelinuxLabel *string `json:"selinuxLabel,omitempty"`
+}
+
+// StateInfo contains a description of the new state the container has entered.
+type StateInfo struct {
+	CommonStateInfo
+
+	// Platform specific StateInfo
+	OOMKilled bool
+}
+
+// Stats contains the stats properties from containerd.
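
Returning to StdioCallback and IOPipe above: they are the seam where the daemon attaches to a container's streams. The client opens the pipes, then hands them over exactly once through the callback. A trimmed sketch of a callback that drains stdout into a buffer (types re-declared so it stands alone; in the daemon the copies run for the container's lifetime):

    package main

    import (
    	"bytes"
    	"fmt"
    	"io"
    	"io/ioutil"
    	"strings"
    	"sync"
    )

    // IOPipe and StdioCallback are trimmed re-declarations; only the fields
    // used here are kept.
    type IOPipe struct {
    	Stdin  io.WriteCloser
    	Stdout io.ReadCloser
    	Stderr io.ReadCloser
    }

    type StdioCallback func(IOPipe) error

    func main() {
    	var out bytes.Buffer
    	var wg sync.WaitGroup

    	// A callback that copies stdout into a buffer in the background.
    	attach := StdioCallback(func(p IOPipe) error {
    		wg.Add(1)
    		go func() {
    			defer wg.Done()
    			io.Copy(&out, p.Stdout)
    		}()
    		return nil
    	})

    	pipe := IOPipe{Stdout: ioutil.NopCloser(strings.NewReader("hello\n"))}
    	if err := attach(pipe); err != nil {
    		fmt.Println("attach failed:", err)
    		return
    	}
    	wg.Wait()
    	fmt.Print(out.String()) // hello
    }
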
+type Stats containerd.StatsResponse
+
+// Summary contains a container summary from containerd
+type Summary struct{}
+
+// Resources defines updatable container resource values.
+type Resources containerd.UpdateResource
+
+// Checkpoints contains the details of a checkpoint
+type Checkpoints containerd.ListCheckpointResponse
diff --git a/libcontainerd/types_solaris.go b/libcontainerd/types_solaris.go
new file mode 100644
index 0000000..2ab18eb
--- /dev/null
+++ b/libcontainerd/types_solaris.go
@@ -0,0 +1,43 @@
+package libcontainerd
+
+import (
+	containerd "github.com/containerd/containerd/api/grpc/types"
+	"github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// Process contains information to start a specific application inside the container.
+type Process struct {
+	// Terminal creates an interactive terminal for the container.
+	Terminal bool `json:"terminal"`
+	// User specifies user information for the process.
+	User *specs.User `json:"user"`
+	// Args specifies the binary and arguments for the application to execute.
+	Args []string `json:"args"`
+	// Env populates the process environment for the process.
+	Env []string `json:"env,omitempty"`
+	// Cwd is the current working directory for the process and must be
+	// relative to the container's root.
+	Cwd *string `json:"cwd"`
+	// Capabilities are linux capabilities that are kept for the container.
+	Capabilities []string `json:"capabilities,omitempty"`
+}
+
+// Stats contains the stats properties from containerd.
+type Stats struct{}
+
+// Summary contains a container summary from containerd
+type Summary struct{}
+
+// StateInfo contains a description of the new state the container has entered.
+type StateInfo struct {
+	CommonStateInfo
+
+	// Platform specific StateInfo
+	OOMKilled bool
+}
+
+// Resources defines updatable container resource values.
+type Resources struct{}
+
+// Checkpoints contains the details of a checkpoint
+type Checkpoints containerd.ListCheckpointResponse
diff --git a/libcontainerd/types_windows.go b/libcontainerd/types_windows.go
new file mode 100644
index 0000000..317bfb0
--- /dev/null
+++ b/libcontainerd/types_windows.go
@@ -0,0 +1,79 @@
+package libcontainerd
+
+import (
+	"github.com/Microsoft/hcsshim"
+	"github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// Process contains information to start a specific application inside the container.
+type Process specs.Process
+
+// Summary contains a ProcessList item from HCS to support `top`
+type Summary hcsshim.ProcessListItem
+
+// StateInfo contains a description of the new state the container has entered.
+type StateInfo struct {
+	CommonStateInfo
+
+	// Platform specific StateInfo
+	UpdatePending bool // Indicates that there are some update operations pending that should be completed by a servicing container.
+}
+
+// Stats contains statistics from HCS
+type Stats hcsshim.Statistics
+
+// Resources defines updatable container resource values.
+type Resources struct{}
+
+// ServicingOption is a CreateOption with a no-op application that signifies
+// the container needs to be used for a Windows servicing operation.
+type ServicingOption struct {
+	IsServicing bool
+}
+
+// FlushOption is a CreateOption that signifies if the container should be
+// started with flushes ignored until boot has completed. This is an optimisation
+// for first boot of a container.
+type FlushOption struct {
+	IgnoreFlushesDuringBoot bool
+}
+
+// HyperVIsolationOption is a CreateOption that indicates whether the runtime
+// should start the container as a Hyper-V container.
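
Options like ServicingOption above (and HyperVIsolationOption just below) travel as an opaque []CreateOption and are recovered by type assertion, as the loop at the top of container_windows.go start() does. A standalone sketch of that recovery, with the two option types re-declared so it compiles on its own:

    package main

    import "fmt"

    // CreateOption matches the package's shape.
    type CreateOption interface {
    	Apply(interface{}) error
    }

    type ServicingOption struct{ IsServicing bool }

    func (s *ServicingOption) Apply(interface{}) error { return nil }

    type FlushOption struct{ IgnoreFlushesDuringBoot bool }

    func (f *FlushOption) Apply(interface{}) error { return nil }

    // isServicing mirrors the loop in start(): scan the opaque slice and
    // pick out the option we care about with a type assertion.
    func isServicing(opts []CreateOption) bool {
    	for _, o := range opts {
    		if s, ok := o.(*ServicingOption); ok && s.IsServicing {
    			return true
    		}
    	}
    	return false
    }

    func main() {
    	opts := []CreateOption{
    		&FlushOption{IgnoreFlushesDuringBoot: true},
    		&ServicingOption{IsServicing: true},
    	}
    	fmt.Println(isServicing(opts)) // true
    }
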
+type HyperVIsolationOption struct { + IsHyperV bool +} + +// LayerOption is a CreateOption that indicates to the runtime the layer folder +// and layer paths for a container. +type LayerOption struct { + // LayerFolderPath is the path to the current layer folder. Empty for Hyper-V containers. + LayerFolderPath string `json:",omitempty"` + // Layer paths of the parent layers + LayerPaths []string +} + +// NetworkEndpointsOption is a CreateOption that provides the runtime list +// of network endpoints to which a container should be attached during its creation. +type NetworkEndpointsOption struct { + Endpoints []string + AllowUnqualifiedDNSQuery bool + DNSSearchList []string + NetworkSharedContainerID string +} + +// CredentialsOption is a CreateOption that indicates the credentials from +// a credential spec to be used to the runtime +type CredentialsOption struct { + Credentials string +} + +// Checkpoint holds the details of a checkpoint (not supported in windows) +type Checkpoint struct { + Name string +} + +// Checkpoints contains the details of a checkpoint +type Checkpoints struct { + Checkpoints []*Checkpoint +} diff --git a/libcontainerd/utils_linux.go b/libcontainerd/utils_linux.go new file mode 100644 index 0000000..5fd5bf6 --- /dev/null +++ b/libcontainerd/utils_linux.go @@ -0,0 +1,62 @@ +package libcontainerd + +import ( + "syscall" + + containerd "github.com/containerd/containerd/api/grpc/types" + "github.com/opencontainers/runtime-spec/specs-go" +) + +func getRootIDs(s specs.Spec) (int, int, error) { + var hasUserns bool + for _, ns := range s.Linux.Namespaces { + if ns.Type == specs.UserNamespace { + hasUserns = true + break + } + } + if !hasUserns { + return 0, 0, nil + } + uid := hostIDFromMap(0, s.Linux.UIDMappings) + gid := hostIDFromMap(0, s.Linux.GIDMappings) + return uid, gid, nil +} + +func hostIDFromMap(id uint32, mp []specs.LinuxIDMapping) int { + for _, m := range mp { + if id >= m.ContainerID && id <= m.ContainerID+m.Size-1 { + return int(m.HostID + id - m.ContainerID) + } + } + return 0 +} + +func systemPid(ctr *containerd.Container) uint32 { + var pid uint32 + for _, p := range ctr.Processes { + if p.Pid == InitFriendlyName { + pid = p.SystemPid + } + } + return pid +} + +func convertRlimits(sr []specs.LinuxRlimit) (cr []*containerd.Rlimit) { + for _, r := range sr { + cr = append(cr, &containerd.Rlimit{ + Type: r.Type, + Hard: r.Hard, + Soft: r.Soft, + }) + } + return +} + +// setPDeathSig sets the parent death signal to SIGKILL +func setSysProcAttr(sid bool) *syscall.SysProcAttr { + return &syscall.SysProcAttr{ + Setsid: sid, + Pdeathsig: syscall.SIGKILL, + } +} diff --git a/libcontainerd/utils_solaris.go b/libcontainerd/utils_solaris.go new file mode 100644 index 0000000..10ae599 --- /dev/null +++ b/libcontainerd/utils_solaris.go @@ -0,0 +1,27 @@ +package libcontainerd + +import ( + "syscall" + + containerd "github.com/containerd/containerd/api/grpc/types" + "github.com/opencontainers/runtime-spec/specs-go" +) + +func getRootIDs(s specs.Spec) (int, int, error) { + return 0, 0, nil +} + +func systemPid(ctr *containerd.Container) uint32 { + var pid uint32 + for _, p := range ctr.Processes { + if p.Pid == InitFriendlyName { + pid = p.SystemPid + } + } + return pid +} + +// setPDeathSig sets the parent death signal to SIGKILL +func setSysProcAttr(sid bool) *syscall.SysProcAttr { + return nil +} diff --git a/libcontainerd/utils_windows.go b/libcontainerd/utils_windows.go new file mode 100644 index 0000000..41ac40d --- /dev/null +++ 
b/libcontainerd/utils_windows.go
@@ -0,0 +1,46 @@
+package libcontainerd
+
+import "strings"
+
+// setupEnvironmentVariables converts a string array of environment variables
+// into a map as required by the HCS. Source array is in format [k1=v1] [k2=v2] etc.
+func setupEnvironmentVariables(a []string) map[string]string {
+	r := make(map[string]string)
+	for _, s := range a {
+		arr := strings.SplitN(s, "=", 2)
+		if len(arr) == 2 {
+			r[arr[0]] = arr[1]
+		}
+	}
+	return r
+}
+
+// Apply for a servicing option is a no-op.
+func (s *ServicingOption) Apply(interface{}) error {
+	return nil
+}
+
+// Apply for the flush option is a no-op.
+func (f *FlushOption) Apply(interface{}) error {
+	return nil
+}
+
+// Apply for the Hyper-V isolation option is a no-op.
+func (h *HyperVIsolationOption) Apply(interface{}) error {
+	return nil
+}
+
+// Apply for the layer option is a no-op.
+func (h *LayerOption) Apply(interface{}) error {
+	return nil
+}
+
+// Apply for the network endpoints option is a no-op.
+func (s *NetworkEndpointsOption) Apply(interface{}) error {
+	return nil
+}
+
+// Apply for the credentials option is a no-op.
+func (s *CredentialsOption) Apply(interface{}) error {
+	return nil
+}
diff --git a/libcontainerd/utils_windows_test.go b/libcontainerd/utils_windows_test.go
new file mode 100644
index 0000000..f3679bf
--- /dev/null
+++ b/libcontainerd/utils_windows_test.go
@@ -0,0 +1,13 @@
+package libcontainerd
+
+import (
+	"testing"
+)
+
+func TestEnvironmentParsing(t *testing.T) {
+	env := []string{"foo=bar", "car=hat", "a=b=c"}
+	result := setupEnvironmentVariables(env)
+	if len(result) != 3 || result["foo"] != "bar" || result["car"] != "hat" || result["a"] != "b=c" {
+		t.Fatalf("Expected map[foo:bar car:hat a:b=c], got %v", result)
+	}
+}
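
Finally, a worked detail from hostIDFromMap in utils_linux.go above: a container ID is translated to the host by locating the mapping whose [ContainerID, ContainerID+Size) range contains it and shifting by the offset into the host range. Re-declared here so it runs standalone; note that, like the real helper, it returns 0 when no mapping matches:

    package main

    import "fmt"

    // LinuxIDMapping mirrors the OCI runtime-spec struct used by hostIDFromMap.
    type LinuxIDMapping struct {
    	ContainerID uint32
    	HostID      uint32
    	Size        uint32
    }

    // hostIDFromMap performs the same lookup as utils_linux.go: find the
    // range containing id and shift it into the host ID space.
    func hostIDFromMap(id uint32, mp []LinuxIDMapping) int {
    	for _, m := range mp {
    		if id >= m.ContainerID && id <= m.ContainerID+m.Size-1 {
    			return int(m.HostID + id - m.ContainerID)
    		}
    	}
    	return 0
    }

    func main() {
    	maps := []LinuxIDMapping{{ContainerID: 0, HostID: 100000, Size: 65536}}
    	fmt.Println(hostIDFromMap(0, maps))    // 100000: container root is remapped
    	fmt.Println(hostIDFromMap(1000, maps)) // 101000
    }
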