diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 8b4ef8848e..7b6af3c9e9 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -100,14 +100,6 @@ type Stack interface { // Restore restarts the network stack after restore. Restore() - // ReplaceConfig replaces the new network stack configuration to the - // loaded or saved network stack after restore. - // TODO(b/379115439): This method is a workaround to update netstack config - // during restore. It should be removed after a new method is added to - // extract the complete config from the spec and update it in the loaded - // stack during restore. - ReplaceConfig(st Stack) - // Destroy the network stack. Destroy() diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go index 0ffeee03d6..111b9112d2 100644 --- a/pkg/sentry/inet/namespace.go +++ b/pkg/sentry/inet/namespace.go @@ -36,7 +36,7 @@ type Namespace struct { // // At afterLoad(), creator will be used to create network stack. Stateify // needs to wait for this field to be loaded before calling afterLoad(). - creator NetworkStackCreator `state:"wait"` + creator NetworkStackCreator `state:"nosave"` // isRoot indicates whether this is the root network namespace. 
isRoot bool diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index d9e0a9c421..3fb054ad25 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -836,14 +836,6 @@ func (k *Kernel) LoadFrom(ctx context.Context, r, pagesMetadata io.Reader, pages if saveRestoreNet { log.Infof("netstack save restore is enabled") - s := k.rootNetworkNamespace.Stack() - if s == nil { - panic("inet.Stack cannot be nil when netstack s/r is enabled") - } - if net != nil { - s.ReplaceConfig(net) - } - s.Restore() } else if net != nil { net.Restore() } diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index 4d1facb0af..cc9f35f88e 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -398,9 +398,6 @@ func (*Stack) Pause() {} // Restore implements inet.Stack.Restore. func (*Stack) Restore() {} -// ReplaceConfig implements inet.Stack.ReplaceConfig. -func (s *Stack) ReplaceConfig(_ inet.Stack) {} - // Resume implements inet.Stack.Resume. func (*Stack) Resume() {} diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index fc21771bd3..17927bcbb3 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" @@ -922,15 +923,8 @@ func (s *Stack) Pause() { // Restore implements inet.Stack.Restore. func (s *Stack) Restore() { - s.Stack.Restore() -} - -// ReplaceConfig implements inet.Stack.ReplaceConfig. 
-func (s *Stack) ReplaceConfig(st inet.Stack) { - if _, ok := st.(*Stack); !ok { - panic("netstack.Stack cannot be nil when netstack s/r is enabled") - } - s.Stack.ReplaceConfig(st.(*Stack).Stack) + defaultIPTables := netfilter.DefaultLinuxTables + s.Stack.Restore(defaultIPTables) } // Resume implements inet.Stack.Resume. diff --git a/pkg/tcpip/stack/save_restore.go b/pkg/tcpip/stack/save_restore.go index 838cf5f4fd..58961ba027 100644 --- a/pkg/tcpip/stack/save_restore.go +++ b/pkg/tcpip/stack/save_restore.go @@ -20,10 +20,14 @@ import ( "time" cryptorand "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/tcpip" ) // afterLoad is invoked by stateify. func (s *Stack) afterLoad(context.Context) { s.insecureRNG = rand.New(rand.NewSource(time.Now().UnixNano())) s.secureRNG = cryptorand.RNGFrom(cryptorand.Reader) + s.mu.Lock() + s.nics = make(map[tcpip.NICID]*nic) + s.mu.Unlock() } diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 472b6d4b03..66b9cd9028 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -1966,45 +1966,17 @@ func (s *Stack) Pause() { } } -func (s *Stack) getNICs() map[tcpip.NICID]*nic { - s.mu.RLock() - defer s.mu.RUnlock() - - nics := s.nics - return nics -} - -// ReplaceConfig replaces config in the loaded stack. -func (s *Stack) ReplaceConfig(st *Stack) { - if st == nil { - panic("stack.Stack cannot be nil when netstack s/r is enabled") - } - - // Update route table. - s.SetRouteTable(st.GetRouteTable()) - - // Update NICs. - nics := st.getNICs() - s.mu.Lock() - defer s.mu.Unlock() - s.nics = make(map[tcpip.NICID]*nic) - for id, nic := range nics { - nic.stack = s - s.nics[id] = nic - _ = s.NextNICID() - } - s.tables = st.tables -} - // Restore restarts the stack after a restore. This must be called after the // entire system has been restored. 
-func (s *Stack) Restore() { +func (s *Stack) Restore(defaultIPTables func(clock tcpip.Clock, rand *rand.Rand) *IPTables) { // RestoredEndpoint.Restore() may call other methods on s, so we can't hold // s.mu while restoring the endpoints. s.mu.Lock() eps := s.restoredEndpoints s.restoredEndpoints = nil saveRestoreEnabled := s.saveRestoreEnabled + s.icmpRateLimiter = NewICMPRateLimiter(s.clock) + s.tables = defaultIPTables(s.clock, s.insecureRNG) s.mu.Unlock() for _, e := range eps { e.Restore(s) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 7de9164f79..eda0747b24 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -20,11 +20,13 @@ go_library( "loader.go", "mount_hints.go", "network.go", + "no_xdp.go", "restore.go", "restore_impl.go", "seccheck.go", "strace.go", "vfs.go", + "xdp.go", ], visibility = [ "//pkg/test:__subpackages__", @@ -125,17 +127,23 @@ go_library( "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/urpc", + "//pkg/xdp", "//runsc/boot/filter", "//runsc/boot/portforward", "//runsc/boot/pprof", "//runsc/boot/procfs", "//runsc/config", "//runsc/profile", + "//runsc/sandbox/bpf", "//runsc/specutils", "//runsc/specutils/seccomp", "//runsc/version", + "//tools/xdp/cmd", + "@com_github_cilium_ebpf//:go_default_library", + "@com_github_cilium_ebpf//link:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", + "@com_github_vishvananda_netlink//:go_default_library", "@org_golang_google_protobuf//proto:go_default_library", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 217280b1e0..6c7f4989ab 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -120,6 +120,10 @@ const ( // ContMgrContainerRuntimeState returns the runtime state of a container. 
ContMgrContainerRuntimeState = "containerManager.ContainerRuntimeState" + + // ContMgrStoreNetworkConfig stores in the loader the network config that is + // required during restore. + ContMgrStoreNetworkConfig = "containerManager.StoreNetworkConfig" ) const ( @@ -131,6 +135,9 @@ const ( // DebugStacks collects sandbox stacks for debugging. DebugStacks = "debug.Stacks" + + // NetworkSetupNetwork sets up the network stack. + NetworkSetupNetwork = "Network.SetupNetwork" ) // Profiling related commands (see pprof.go for more details). @@ -943,3 +950,9 @@ func (cm *containerManager) ContainerRuntimeState(cid *string, state *ContainerR *state = cm.l.containerRuntimeState(*cid) return nil } + +// StoreNetworkConfig stores the network config in the loader. +func (cm *containerManager) StoreNetworkConfig(netConf *NetworkConfig, _ *struct{}) error { + cm.l.netConf = netConf + return nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 4ae3b787e5..56af9c0c17 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,6 +19,7 @@ import ( "encoding/json" "errors" "fmt" + "net" "os" "runtime" "strconv" @@ -255,6 +256,18 @@ type Loader struct { // saveRestoreNet indicates if the saved network stack should be used // during restore. saveRestoreNet bool + + // netConf contains the network configuration required during restore. + netConf *NetworkConfig +} + +// NetworkConfig contains the network config that is passed to Network.SetupNetwork and stored in the loader for use during restore. +type NetworkConfig struct { + Args *CreateLinksAndRoutesArgs + InitArgs *InitPluginStackArgs + Iface net.Interface + Network config.NetworkType + XDPMode config.XDPMode } // execID uniquely identifies a sentry process that is executed in a container. 
diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 19bf6d6f71..3506b5b81e 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -605,3 +605,49 @@ func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask { addr := ipToAddress(net.IP(ipMask)) return tcpip.MaskFromBytes(addr.AsSlice()) } + +// SetupNetworkSandbox sets up sandbox networking for the XDP mode in netConf.XDPMode. +// Redirect and tunnel modes use the XDP setup methods; off and ns modes create links and routes from netConf.Args. +func (n *Network) SetupNetworkSandbox(netConf *NetworkConfig, _ *struct{}) error { + if netConf == nil { + return fmt.Errorf("network config is nil") + } + switch netConf.XDPMode { + case config.XDPModeOff: + case config.XDPModeNS: + case config.XDPModeRedirect: + if err := n.SetupXDPModeRedirect(netConf, nil); err != nil { + return fmt.Errorf("failed to create XDP redirect interface: %w", err) + } + return nil + case config.XDPModeTunnel: + if err := n.SetupXDPModeTunnel(netConf, nil); err != nil { + return fmt.Errorf("failed to create XDP tunnel interface: %w", err) + } + return nil + default: + return fmt.Errorf("unknown XDP mode: %v", netConf.XDPMode) + } + return n.CreateLinksAndRoutes(netConf.Args, nil) +} + +// SetupNetwork sets up the network stack for the network type in netConf.Network. +// It returns an error, rather than panicking, when netConf is nil or the network type is unknown. +func (n *Network) SetupNetwork(netConf *NetworkConfig, _ *struct{}) error { + if netConf == nil { + return fmt.Errorf("network config is nil") + } + switch netConf.Network { + case config.NetworkNone: + return n.CreateLinksAndRoutes(netConf.Args, nil) + case config.NetworkHost: + /* nothing to do */ + return nil + case config.NetworkPlugin: + return n.InitPluginStack(netConf.InitArgs, nil) + case config.NetworkSandbox: + return n.SetupNetworkSandbox(netConf, nil) + default: + return fmt.Errorf("unknown network type: %v", netConf.Network) + } +} diff --git a/runsc/boot/no_xdp.go b/runsc/boot/no_xdp.go new file mode 100644 index 0000000000..125786f78f --- /dev/null +++ b/runsc/boot/no_xdp.go @@ -0,0 +1,42 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !xdp +// +build !xdp + +package boot + +import ( + "errors" + "net" + "os" +) + +// This file holds placeholders for XDP support, which is not compiled in by default. +// +// To enable XDP support, build gVisor with `--define=gotags=xdp`. + +const noXDPMsg = "XDP support was not built into this release -- rebuild with --define=gotags=xdp" + +func (n *Network) SetupXDPModeRedirect(netConf *NetworkConfig, _ *struct{}) error { + return errors.New(noXDPMsg) +} + +func createSocketXDP(iface net.Interface) ([]*os.File, error) { + return nil, errors.New(noXDPMsg) +} + +func (n *Network) SetupXDPModeTunnel(netConf *NetworkConfig, _ *struct{}) error { + return errors.New(noXDPMsg) +} diff --git a/runsc/boot/restore.go b/runsc/boot/restore.go index 1a40804c05..9db33448b2 100644 --- a/runsc/boot/restore.go +++ b/runsc/boot/restore.go @@ -708,12 +708,32 @@ func (r *restorer) restore(l *Loader, unsafeSkipRestoreSpecValidation bool) erro l.kernelInitExtra() + if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { + // The network stack will be loaded from the state file, we do + // not need this network stack anymore. + oldInetStack.Destroy() + + n := &Network{ + Stack: eps.Stack, + } + log.Infof("network config: %+v", l.netConf) + if err := n.SetupNetwork(l.netConf, nil); err != nil { + return fmt.Errorf("restore network error: %w", err) + } + } else { + l.k.RootNetworkNamespace().RestoreRootStack(hostinet.NewStack()) + } + // Refresh the control server with the newly created kernel. 
l.ctrl.refreshHandlers() // Release `l.mu` before calling into callbacks. cu.Clean() + if _, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { + l.k.RootNetworkNamespace().Stack().Restore() + } + // r.restoreDone() signals and waits for the sandbox to start. if err := r.restoreDone(); err != nil { return fmt.Errorf("restorer.restoreDone callback failed: %w", err) diff --git a/runsc/boot/xdp.go b/runsc/boot/xdp.go new file mode 100644 index 0000000000..d9e0f3b83e --- /dev/null +++ b/runsc/boot/xdp.go @@ -0,0 +1,314 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build xdp +// +build xdp + +package boot + +import ( + "bytes" + "fmt" + "net" + "os" + "strings" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/link" + "github.com/vishvananda/netlink" + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/pkg/xdp" + "gvisor.dev/gvisor/runsc/config" + "gvisor.dev/gvisor/runsc/sandbox/bpf" + xdpcmd "gvisor.dev/gvisor/tools/xdp/cmd" +) + +func setupXDPModeRedirectArgs(netConf *NetworkConfig) error { + // Create an XDP socket. The sentry will mmap the rings. + xdpSockFD, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) + if err != nil { + return fmt.Errorf("unable to create AF_XDP socket: %w", err) + } + xdpSock := os.NewFile(uintptr(xdpSockFD), "xdp-sock-fd") + + // Dup to ensure os.File doesn't close it prematurely. 
+ if _, err := unix.Dup(xdpSockFD); err != nil { + return fmt.Errorf("failed to dup XDP sock: %w", err) + } + netConf.args.FilePayload.Files = append(args.FilePayload.Files, xdpSock) + + if err := pcapAndNAT(netConf.args, conf); err != nil { + return err + } + + log.Infof("Setting up network, config: %+v", netConf.args) + return nil +} + +func setupXDPModeRedirectInterface(netConf *NetworkConfig) error { + iface := netConf.iface + // Insert socket into eBPF map. Note that sockets are automatically + // removed from eBPF maps when released. See net/xdp/xsk.c:xsk_release + // and net/xdp/xsk.c:xsk_delete_from_maps. + mapPath := xdpcmd.RedirectMapPath(iface.Name) + pinnedMap, err := ebpf.LoadPinnedMap(mapPath, nil) + if err != nil { + return fmt.Errorf("failed to load pinned map %s: %w", mapPath, err) + } + // TODO(b/240191988): Updating of pinned maps should be sychronized and + // check for the existence of the key. + mapKey := uint32(0) + mapVal := uint32(xdpSockFD) + if err := pinnedMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { + return fmt.Errorf("failed to insert socket into map %s: %w", mapPath, err) + } + + // Bind to the device. + // TODO(b/240191988): We can't assume there's only one queue, but this + // appears to be the case on gVNIC instances. + if err := xdp.Bind(xdpSockFD, uint32(iface.Index), 0 /* queueID */, true /*conf.AFXDPUseNeedWakeup*/); err != nil { + return fmt.Errorf("failed to bind to interface %q: %v", iface.Name, err) + } + + return nil +} + +func setupXDPModeTunnel(netConf *NetworkConfig) error { + args := netConf.args + // Setup the XDP socket on the gVisor nic. + files, err := func() ([]*os.File, error) { + // Join the network namespace that we will be copying. + restore, err := joinNetNS(nsPath) + if err != nil { + return nil, err + } + defer restore() + + // Create an XDP socket. The sentry will mmap memory for the various + // rings and bind to the device. 
+ fd, err := unix.Socket(unix.AF_XDP, unix.SOCK_RAW, 0) + if err != nil { + return nil, fmt.Errorf("unable to create AF_XDP socket: %v", err) + } + + // We also need to, before dropping privileges, attach a program to the + // device and insert our socket into its map. + + // Load into the kernel. + spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.AFXDPProgram)) + if err != nil { + return nil, fmt.Errorf("failed to load spec: %v", err) + } + + var objects struct { + Program *ebpf.Program `ebpf:"xdp_prog"` + SockMap *ebpf.Map `ebpf:"sock_map"` + } + if err := spec.LoadAndAssign(&objects, nil); err != nil { + return nil, fmt.Errorf("failed to load program: %v", err) + } + + // We assume there are two interfaces in the netns: a loopback and veth. + ifaces, err := net.Interfaces() + if err != nil { + return nil, fmt.Errorf("querying interfaces in ns: %w", err) + } + + var iface *net.Interface + for _, netIface := range ifaces { + if netIface.Flags&net.FlagLoopback == 0 { + iface = &netIface + break + } + } + if iface == nil { + return nil, fmt.Errorf("unable to find non-loopback interface in the ns") + } + args.XDPLinks[0].InterfaceIndex = iface.Index + + rawLink, err := link.AttachRawLink(link.RawLinkOptions{ + Program: objects.Program, + Attach: ebpf.AttachXDP, + Target: iface.Index, + // By not setting the Flag field, the kernel will choose the + // fastest mode. In order those are: + // - Offloaded onto the NIC. + // - Running directly in the driver. + // - Generic mode, which works with any NIC/driver but lacks + // much of the XDP performance boost. + }) + if err != nil { + return nil, fmt.Errorf("failed to attach BPF program to interface %q: %v", iface.Name, err) + } + + // Insert our AF_XDP socket into the BPF map that dictates where + // packets are redirected to. + // TODO(b/240191988): Updating of pinned maps should be + // sychronized and check for the existence of the key. 
+ key := uint32(0) + val := uint32(fd) + if err := objects.SockMap.Update(&key, &val, 0 /* flags */); err != nil { + return nil, fmt.Errorf("failed to insert socket into BPF map: %v", err) + } + + // We need to keep the Program, SockMap, and link FDs open until they + // can be passed to the sandbox process. + progFD, err := unix.Dup(objects.Program.FD()) + if err != nil { + return nil, fmt.Errorf("failed to dup BPF program: %v", err) + } + sockMapFD, err := unix.Dup(objects.SockMap.FD()) + if err != nil { + return nil, fmt.Errorf("failed to dup BPF map: %v", err) + } + linkFD, err := unix.Dup(rawLink.FD()) + if err != nil { + return nil, fmt.Errorf("failed to dup BPF link: %v", err) + } + + return []*os.File{ + os.NewFile(uintptr(fd), "xdp-fd"), // The socket. + os.NewFile(uintptr(progFD), "program-fd"), // The XDP program. + os.NewFile(uintptr(sockMapFD), "sockmap-fd"), // The XDP map. + os.NewFile(uintptr(linkFD), "link-fd"), // The XDP link. + }, nil + }() + if err != nil { + return fmt.Errorf("failed to create AF_XDP socket for container: %w", err) + } + args.FilePayload.Files = append(args.FilePayload.Files, files...) + + // We're back in the parent netns. Get all interfaces. + ifaces, err := net.Interfaces() + if err != nil { + return fmt.Errorf("querying interfaces: %w", err) + } + + // TODO(b/240191988): Find a better way to identify the other end of the veth. + var vethIface *net.Interface + for _, iface := range ifaces { + if strings.HasPrefix(iface.Name, "veth") { + vethIface = &iface + break + } + } + if vethIface == nil { + return fmt.Errorf("unable to find veth interface") + } + + // Insert veth into host eBPF map. + hostMapPath := xdpcmd.TunnelHostMapPath(hostIface.Name) + pinnedHostMap, err := ebpf.LoadPinnedMap(hostMapPath, nil) + if err != nil { + return fmt.Errorf("failed to load pinned host map %s: %w", hostMapPath, err) + } + // TODO(b/240191988): Updating of pinned maps should be sychronized and + // check for the existence of the key. 
+ mapKey := uint32(0) + mapVal := uint32(vethIface.Index) + if err := pinnedHostMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { + return fmt.Errorf("failed to insert veth into host map %s: %w", hostMapPath, err) + } + + // Attach a program to the veth. + spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(bpf.TunnelVethProgram)) + if err != nil { + return fmt.Errorf("failed to load spec: %v", err) + } + + var objects struct { + Program *ebpf.Program `ebpf:"xdp_veth_prog"` + DevMap *ebpf.Map `ebpf:"dev_map"` + } + if err := spec.LoadAndAssign(&objects, nil); err != nil { + return fmt.Errorf("failed to load program: %v", err) + } + defer func() { + if err := objects.Program.Close(); err != nil { + log.Infof("failed to close program: %v", err) + } + if err := objects.DevMap.Close(); err != nil { + log.Infof("failed to close sock map: %v", err) + } + }() + + attached, err := link.AttachXDP(link.XDPOptions{ + Program: objects.Program, + Interface: vethIface.Index, + // By not setting the Flag field, the kernel will choose the + // fastest mode. In order those are: + // - Offloaded onto the NIC. + // - Running directly in the driver. + // - Generic mode, which works with any NIC/driver but lacks + // much of the XDP performance boost. + }) + if err != nil { + return fmt.Errorf("failed to attach: %w", err) + } + + var ( + vethPinDir = xdpcmd.RedirectPinDir(vethIface.Name) + vethMapPath = xdpcmd.TunnelVethMapPath(vethIface.Name) + vethProgramPath = xdpcmd.TunnelVethProgramPath(vethIface.Name) + vethLinkPath = xdpcmd.TunnelVethLinkPath(vethIface.Name) + ) + + // Create directory /sys/fs/bpf//. + if err := os.Mkdir(vethPinDir, 0700); err != nil && !os.IsExist(err) { + return fmt.Errorf("failed to create directory for pinning at %s: %v", vethPinDir, err) + } + + // Pin the map at /sys/fs/bpf//tunnel_host_map. 
+ if err := objects.DevMap.Pin(vethMapPath); err != nil { + return fmt.Errorf("failed to pin map at %s", vethMapPath) + } + log.Infof("Pinned map at %s", vethMapPath) + + // Pin the program at /sys/fs/bpf//tunnel_host_program. + if err := objects.Program.Pin(vethProgramPath); err != nil { + return fmt.Errorf("failed to pin program at %s", vethProgramPath) + } + log.Infof("Pinned program at %s", vethProgramPath) + + // Make everything persistent by pinning the link. Otherwise, the XDP + // program would detach when this process exits. + if err := attached.Pin(vethLinkPath); err != nil { + return fmt.Errorf("failed to pin link at %s", vethLinkPath) + } + log.Infof("Pinned link at %s", vethLinkPath) + + // Insert host into veth eBPF map. + // TODO(b/240191988): We should be able to use the existing map instead + // of opening a pinned copy. + pinnedVethMap, err := ebpf.LoadPinnedMap(vethMapPath, nil) + if err != nil { + return fmt.Errorf("failed to load pinned veth map %s: %w", vethMapPath, err) + } + // TODO(b/240191988): Updating of pinned maps should be sychronized and + // check for the existence of the key. + mapKey = uint32(0) + mapVal = uint32(hostIface.Index) + if err := pinnedVethMap.Update(&mapKey, &mapVal, ebpf.UpdateAny); err != nil { + return fmt.Errorf("failed to insert host into veth map %s: %w", vethMapPath, err) + } + + if err := pcapAndNAT(&args, conf); err != nil { + return err + } + + log.Debugf("Setting up network, config: %+v", args) + return nil +} diff --git a/runsc/config/flags.go b/runsc/config/flags.go index c4149c5169..b4f40226f6 100644 --- a/runsc/config/flags.go +++ b/runsc/config/flags.go @@ -162,7 +162,7 @@ func RegisterFlags(flagSet *flag.FlagSet) { flagSet.Bool("TESTONLY-afs-syscall-panic", false, "TEST ONLY; do not ever use! 
Used for tests exercising gVisor panic reporting.") flagSet.String("TESTONLY-autosave-image-path", "", "TEST ONLY; enable auto save for syscall tests and set path for state file.") flagSet.Bool("TESTONLY-autosave-resume", false, "TEST ONLY; enable auto save and resume for syscall tests and set path for state file.") - flagSet.Bool("TESTONLY-save-restore-netstack", false, "TEST ONLY; enable save/restore for netstack.") + flagSet.Bool("TESTONLY-save-restore-netstack", true, "TEST ONLY; enable save/restore for netstack.") } // overrideAllowlist lists all flags that can be changed using OCI diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 7d4a3b5c67..5a07ca4982 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -55,119 +55,115 @@ import ( func setupNetwork(conn *urpc.Client, pid int, conf *config.Config) error { log.Infof("Setting up network") + if conf.Network == config.NetworkHost { + // Nothing to do here. + return nil + } + + netConf, err := getNetworkConfArgs(pid, conf) + if err != nil { + return fmt.Errorf("getNetworkConfArgs failed with error: %v", err) + } + if err := conn.Call(boot.NetworkSetupNetwork, &netConf, nil); err != nil { + return fmt.Errorf("setup network failed with error: %v", err) + } + return nil +} + +func getNetworkConfArgs(pid int, conf *config.Config) (boot.NetworkConfig, error) { + var netConf boot.NetworkConfig switch conf.Network { case config.NetworkNone: log.Infof("Network is disabled, create loopback interface only") - if err := createDefaultLoopbackInterface(conf, conn); err != nil { - return fmt.Errorf("creating default loopback interface: %v", err) - } + return getLoopbackArgs(conf), nil case config.NetworkSandbox: // Build the path to the net namespace of the sandbox process. // This is what we will copy. 
nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net") - if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf); err != nil { - return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err) - } - case config.NetworkHost: - // Nothing to do here. + return getNSArgs(nsPath, conf) case config.NetworkPlugin: - if err := initPluginStack(conn, pid, conf); err != nil { - return fmt.Errorf("failed to initialize external stack, error: %v", err) - } + return getPluginArgs(pid, conf) default: - return fmt.Errorf("invalid network type: %v", conf.Network) + return netConf, fmt.Errorf("invalid network type: %v", conf.Network) } - return nil } -func createDefaultLoopbackInterface(conf *config.Config, conn *urpc.Client) error { +func getLoopbackArgs(conf *config.Config) boot.NetworkConfig { link := boot.DefaultLoopbackLink link.GVisorGRO = conf.GVisorGRO - if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ - LoopbackLinks: []boot.LoopbackLink{link}, - DisconnectOk: conf.NetDisconnectOk, - }, nil); err != nil { - return fmt.Errorf("creating loopback link and routes: %v", err) + netConf := boot.NetworkConfig{ + Args: &boot.CreateLinksAndRoutesArgs{ + LoopbackLinks: []boot.LoopbackLink{link}, + DisconnectOk: conf.NetDisconnectOk, + }, } - return nil + return netConf } -func joinNetNS(nsPath string) (func(), error) { - runtime.LockOSThread() - restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ - Type: specs.NetworkNamespace, - Path: nsPath, - }) - if err != nil { - runtime.UnlockOSThread() - return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err) +func getPluginArgs(pid int, conf *config.Config) (boot.NetworkConfig, error) { + pluginStack := plugin.GetPluginStack() + if pluginStack == nil { + return boot.NetworkConfig{}, fmt.Errorf("plugin stack is not registered") } - return func() { - restoreNS() - runtime.UnlockOSThread() - }, nil -} -// isRootNetNS determines whether we are running in the root 
net namespace. -// /proc/sys/net/core/dev_weight only exists in root network namespace. -func isRootNetNS() (bool, error) { - err := unix.Access("/proc/sys/net/core/dev_weight", unix.F_OK) - switch err { - case nil: - return true, nil - case unix.ENOENT: - return false, nil - default: - return false, fmt.Errorf("failed to access /proc/sys/net/core/dev_weight: %v", err) + initStr, fds, err := pluginStack.PreInit(&plugin.PreInitStackArgs{Pid: pid}) + if err != nil { + return boot.NetworkConfig{}, fmt.Errorf("plugin stack PreInit failed: %v", err) } + var args boot.InitPluginStackArgs + args.InitStr = initStr + for _, fd := range fds { + args.FilePayload.Files = append(args.FilePayload.Files, os.NewFile(uintptr(fd), "")) + } + netConf := boot.NetworkConfig{ + InitArgs: &args, + } + return netConf, nil } -// createInterfacesAndRoutesFromNS scrapes the interface and routes from the -// net namespace with the given path, creates them in the sandbox, and removes -// them from the host. -func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *config.Config) error { +func getNSArgs(nsPath string, conf *config.Config) (boot.NetworkConfig, error) { switch conf.XDP.Mode { case config.XDPModeOff: case config.XDPModeNS: - case config.XDPModeRedirect: - if err := createRedirectInterfacesAndRoutes(conn, conf); err != nil { - return fmt.Errorf("failed to create XDP redirect interface: %w", err) + case config.XDPModeRedirect, config.XDPModeTunnel: + args, iface, err := prepareRedirectInterfaceArgs(boot.BindRunsc, conf) + if err != nil { + return boot.NetworkConfig{}, fmt.Errorf("failed to create XDP redirect interface: %w", err) } - return nil - case config.XDPModeTunnel: - if err := createXDPTunnel(conn, nsPath, conf); err != nil { - return fmt.Errorf("failed to create XDP tunnel: %w", err) + netConf := boot.NetworkConfig{ + Args: &args, + Iface: iface, } - return nil + return netConf, nil default: - return fmt.Errorf("unknown XDP mode: %v", conf.XDP.Mode) + 
return boot.NetworkConfig{}, fmt.Errorf("unknown XDP mode: %v", conf.XDP.Mode) } + args := boot.CreateLinksAndRoutesArgs{} + netConf := boot.NetworkConfig{} // Join the network namespace that we will be copying. restore, err := joinNetNS(nsPath) if err != nil { - return err + return netConf, err } defer restore() // Get all interfaces in the namespace. ifaces, err := net.Interfaces() if err != nil { - return fmt.Errorf("querying interfaces: %w", err) + return netConf, fmt.Errorf("querying interfaces: %w", err) } isRoot, err := isRootNetNS() if err != nil { - return err + return netConf, err } if isRoot { - return fmt.Errorf("cannot run with network enabled in root network namespace") + return netConf, fmt.Errorf("cannot run with network enabled in root network namespace") } // Collect addresses and routes from the interfaces. - args := boot.CreateLinksAndRoutesArgs{ - DisconnectOk: conf.NetDisconnectOk, - } + args.DisconnectOk = conf.NetDisconnectOk for _, iface := range ifaces { if iface.Flags&net.FlagUp == 0 { log.Infof("Skipping down interface: %+v", iface) @@ -176,14 +172,14 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con allAddrs, err := iface.Addrs() if err != nil { - return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) + return netConf, fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err) } // We build our own loopback device. 
if iface.Flags&net.FlagLoopback != 0 { link, err := loopbackLink(conf, iface, allAddrs) if err != nil { - return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err) + return netConf, fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err) } args.LoopbackLinks = append(args.LoopbackLinks, link) continue @@ -193,7 +189,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con for _, ifaddr := range allAddrs { ipNet, ok := ifaddr.(*net.IPNet) if !ok { - return fmt.Errorf("address is not IPNet: %+v", ifaddr) + return netConf, fmt.Errorf("address is not IPNet: %+v", ifaddr) } ipAddrs = append(ipAddrs, ipNet) } @@ -205,7 +201,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con // Collect data from the ARP table. dump, err := netlink.NeighList(iface.Index, 0) if err != nil { - return fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err) + return netConf, fmt.Errorf("fetching ARP table for %q: %w", iface.Name, err) } var neighbors []boot.Neighbor @@ -223,11 +219,11 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con // will remove the routes as well. 
routes, defv4, defv6, err := routesForIface(iface) if err != nil { - return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) + return netConf, fmt.Errorf("getting routes for interface %q: %v", iface.Name, err) } if defv4 != nil { if !args.Defaultv4Gateway.Route.Empty() { - return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) + return netConf, fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway) } args.Defaultv4Gateway.Route = *defv4 args.Defaultv4Gateway.Name = iface.Name @@ -235,7 +231,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con if defv6 != nil { if !args.Defaultv6Gateway.Route.Empty() { - return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) + return netConf, fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway) } args.Defaultv6Gateway.Route = *defv6 args.Defaultv6Gateway.Name = iface.Name @@ -244,7 +240,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con // Get the link for the interface. ifaceLink, err := netlink.LinkByName(iface.Name) if err != nil { - return fmt.Errorf("getting link for interface %q: %w", iface.Name, err) + return netConf, fmt.Errorf("getting link for interface %q: %w", iface.Name, err) } linkAddress := ifaceLink.Attrs().HardwareAddr @@ -260,18 +256,18 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con // If we encounter an error while deleting the ip, // verify the ip is still present on the interface. 
if present, err := isAddressOnInterface(iface.Name, addr); err != nil { - return fmt.Errorf("checking if address %v is on interface %q: %w", addr, iface.Name, err) + return netConf, fmt.Errorf("checking if address %v is on interface %q: %w", addr, iface.Name, err) } else if !present { continue } - return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err) + return netConf, fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err) } } if conf.XDP.Mode == config.XDPModeNS { xdpSockFDs, err := createSocketXDP(iface) if err != nil { - return fmt.Errorf("failed to create XDP socket: %v", err) + return netConf, fmt.Errorf("failed to create XDP socket: %v", err) } args.FilePayload.Files = append(args.FilePayload.Files, xdpSockFDs...) args.XDPLinks = append(args.XDPLinks, boot.XDPLink{ @@ -308,13 +304,13 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con log.Debugf("Creating Channel %d", i) socketEntry, err := createSocket(iface, ifaceLink, conf.HostGSO) if err != nil { - return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err) + return netConf, fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err) } if i == 0 { link.GSOMaxSize = socketEntry.gsoMaxSize } else { if link.GSOMaxSize != socketEntry.gsoMaxSize { - return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", + return netConf, fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s", link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name) } } @@ -333,38 +329,40 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, conf *con } if err := pcapAndNAT(&args, conf); err != nil { - return err + return netConf, err } - - log.Debugf("Setting up network, config: %+v", args) - if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil { - return fmt.Errorf("creating links and routes: %w", err) - 
} - return nil + netConf.Args = &args + return netConf, nil } -func initPluginStack(conn *urpc.Client, pid int, conf *config.Config) error { - pluginStack := plugin.GetPluginStack() - if pluginStack == nil { - return fmt.Errorf("plugin stack is not registered") - } - - initStr, fds, err := pluginStack.PreInit(&plugin.PreInitStackArgs{Pid: pid}) +func joinNetNS(nsPath string) (func(), error) { + runtime.LockOSThread() + restoreNS, err := specutils.ApplyNS(specs.LinuxNamespace{ + Type: specs.NetworkNamespace, + Path: nsPath, + }) if err != nil { - return fmt.Errorf("plugin stack PreInit failed: %v", err) - } - var args boot.InitPluginStackArgs - args.InitStr = initStr - for _, fd := range fds { - args.FilePayload.Files = append(args.FilePayload.Files, os.NewFile(uintptr(fd), "")) + runtime.UnlockOSThread() + return nil, fmt.Errorf("joining net namespace %q: %v", nsPath, err) } + return func() { + restoreNS() + runtime.UnlockOSThread() + }, nil +} - log.Debugf("Initializing plugin network stack, config: %+v", args) - if err := conn.Call(boot.NetworkInitPluginStack, &args, nil); err != nil { - return fmt.Errorf("error initializing plugin netstack: %v", err) +// isRootNetNS determines whether we are running in the root net namespace. +// /proc/sys/net/core/dev_weight only exists in root network namespace. 
+func isRootNetNS() (bool, error) { + err := unix.Access("/proc/sys/net/core/dev_weight", unix.F_OK) + switch err { + case nil: + return true, nil + case unix.ENOENT: + return false, nil + default: + return false, fmt.Errorf("failed to access /proc/sys/net/core/dev_weight: %v", err) } - - return nil } // isAddressOnInterface checks if an address is on an interface diff --git a/runsc/sandbox/no_xdp.go b/runsc/sandbox/no_xdp.go index 04f3cf9520..7e4b0cbfd1 100644 --- a/runsc/sandbox/no_xdp.go +++ b/runsc/sandbox/no_xdp.go @@ -23,6 +23,7 @@ import ( "os" "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/config" ) @@ -43,3 +44,7 @@ func createSocketXDP(iface net.Interface) ([]*os.File, error) { func createXDPTunnel(conn *urpc.Client, nsPath string, conf *config.Config) error { return errors.New(noXDPMsg) } + +func prepareRedirectInterfaceArgs(bind boot.BindOpt, conf *config.Config) (boot.CreateLinksAndRoutesArgs, net.Interface, error) { + return boot.CreateLinksAndRoutesArgs{}, net.Interface{}, errors.New(noXDPMsg) +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 4695003a53..280fc75f0e 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -529,8 +529,14 @@ func (s *Sandbox) Restore(conf *config.Config, cid string, imagePath string, dir defer conn.Close() // Configure the network. - if err := setupNetwork(conn, s.Pid.load(), conf); err != nil { - return fmt.Errorf("setting up network: %v", err) + netConf, err := getNetworkConfArgs(s.Pid.load(), conf) + if err != nil { + return fmt.Errorf("getNetworkConfArgs failed with error: %v", err) + } + + // Store the network config in the loader. + if err := conn.Call(boot.ContMgrStoreNetworkConfig, netConf, nil); err != nil { + return fmt.Errorf("storing network args %q: %v", cid, err) } // Restore the container and start the root container.