diff --git a/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml b/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml index 4b398f3bd..197f12b7a 100644 --- a/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml +++ b/deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml @@ -59,32 +59,6 @@ spec: shareProcessNamespace: true {{- end }} initContainers: - - image: {{ include "nvidia-device-plugin.fullimage" . }} - name: gpu-feature-discovery-imex-init - command: ["/bin/bash", "-c"] - args: - - | - IMEX_NODES_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg - if [[ -f /config/${IMEX_NODES_CONFIG_FILE} ]]; then - echo "Removing cached IMEX nodes config" - rm -f /config/${IMEX_NODES_CONFIG_FILE} - fi - - if [[ ! -f /driver-root/${IMEX_NODES_CONFIG_FILE} ]]; then - echo "No IMEX nodes config path detected; Skipping" - exit 0 - fi - - echo "Copying IMEX nodes config" - mkdir -p $(dirname /config/${IMEX_NODES_CONFIG_FILE}) - cp /driver-root/${IMEX_NODES_CONFIG_FILE} /config/${IMEX_NODES_CONFIG_FILE} - volumeMounts: - - name: config - mountPath: /config - - name: driver-root - mountPath: /driver-root/etc - subPath: etc - readOnly: true {{- if $options.hasConfigMap }} - image: {{ include "nvidia-device-plugin.fullimage" . }} name: gpu-feature-discovery-init diff --git a/docs/gpu-feature-discovery/README.md b/docs/gpu-feature-discovery/README.md index 8d7a90a11..c5450bf55 100644 --- a/docs/gpu-feature-discovery/README.md +++ b/docs/gpu-feature-discovery/README.md @@ -221,7 +221,6 @@ For a similar list of labels generated or used by the device plugin, see [here]( | nvidia.com/gpu.replicas | String | Number of GPU replicas available. Will be equal to the number of physical GPUs unless some sharing strategy is employed in which case the GPU count will be multiplied by replicas. | 4 | | nvidia.com/gpu.mode | String | Mode of the GPU. Can be either "compute" or "display". Details of the GPU modes can be found [here](https://docs.nvidia.com/grid/13.0/grid-gpumodeswitch-user-guide/index.html#compute-and-graphics-mode) | compute | | nvidia.com/gpu.clique | String | GPUFabric ClusterUUID + CliqueID | 7b968a6d-c8aa-45e1-9e07-e1e51be99c31.1 | -| nvidia.com/gpu.imex-domain | String | IMEX domain Ip list(Hashed) + CliqueID | 79b326e7-d566-3483-c2a3-9b38fa5cb1c8.1 | Depending on the MIG strategy used, the following set of labels may also be available (or override the default values for some of the labels listed above): diff --git a/internal/lm/imex.go b/internal/lm/imex.go index cf0ef092a..ff31abd98 100644 --- a/internal/lm/imex.go +++ b/internal/lm/imex.go @@ -17,92 +17,26 @@ package lm import ( - "bufio" - "errors" "fmt" - "io" - "net" - "os" - "path/filepath" - "sort" "strings" - "github.com/google/uuid" "k8s.io/klog/v2" spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1" "github.com/NVIDIA/k8s-device-plugin/internal/resource" ) -const ( - // ImexNodesConfigFilePath is the path to the IMEX nodes config file. - // This file contains a list of IP addresses of the nodes in the IMEX domain. - ImexNodesConfigFilePath = "/etc/nvidia-imex/nodes_config.cfg" -) - func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, error) { - var errs error - for _, root := range imexNodesConfigFilePathSearchRoots(config) { - configFilePath := filepath.Join(root, ImexNodesConfigFilePath) - imexLabeler, err := imexLabelerForConfigFile(configFilePath, devices) - if err != nil { - errs = errors.Join(errs, err) - continue - } - if imexLabeler != nil { - klog.Infof("Using labeler for IMEX config %v", configFilePath) - return imexLabeler, nil - } - } - if errs != nil { - return nil, errs - } - - return empty{}, nil -} - -// imexNodesConfigFilePathSearchRoots returns a list of roots to search for the IMEX nodes config file. -func imexNodesConfigFilePathSearchRoots(config *spec.Config) []string { - // By default, search / and /config for config files. - roots := []string{"/", "/config"} - - if config == nil || config.Flags.Plugin == nil || config.Flags.Plugin.ContainerDriverRoot == nil { - return roots - } - - // If a driver root is specified, it is also searched. - return append(roots, *config.Flags.Plugin.ContainerDriverRoot) -} - -func imexLabelerForConfigFile(configFilePath string, devices []resource.Device) (Labeler, error) { - imexConfigFile, err := os.Open(configFilePath) - if os.IsNotExist(err) { - // No imex config file, return empty labels - return nil, nil - } else if err != nil { - return nil, fmt.Errorf("failed to open imex config file: %v", err) - } - defer imexConfigFile.Close() - clusterUUID, cliqueID, err := getFabricIDs(devices) if err != nil { return nil, err } if clusterUUID == "" || cliqueID == "" { - return nil, nil - } - - imexDomainID, err := getImexDomainID(imexConfigFile) - if err != nil { - return nil, err - } - if imexDomainID == "" { - return nil, nil + return empty{}, nil } labels := Labels{ - "nvidia.com/gpu.clique": strings.Join([]string{clusterUUID, cliqueID}, "."), - "nvidia.com/gpu.imex-domain": strings.Join([]string{imexDomainID, cliqueID}, "."), + "nvidia.com/gpu.clique": strings.Join([]string{clusterUUID, cliqueID}, "."), } return labels, nil @@ -147,36 +81,3 @@ func getFabricIDs(devices []resource.Device) (string, string, error) { } return "", "", nil } - -// getImexDomainID reads the imex config file and returns a unique identifier -// based on the sorted list of IP addresses in the file. -func getImexDomainID(r io.Reader) (string, error) { - // Read the file line by line - var ips []string - scanner := bufio.NewScanner(r) - for scanner.Scan() { - ip := strings.TrimSpace(scanner.Text()) - if net.ParseIP(ip) == nil { - return "", fmt.Errorf("invalid IP address in imex config file: %s", ip) - } - ips = append(ips, ip) - } - - if err := scanner.Err(); err != nil { - return "", fmt.Errorf("failed to read imex config file: %v", err) - } - - if len(ips) == 0 { - // No IPs in the file, return empty labels - return "", nil - } - - sort.Strings(ips) - - return generateContentUUID(strings.Join(ips, "\n")), nil - -} - -func generateContentUUID(seed string) string { - return uuid.NewSHA1(uuid.Nil, []byte(seed)).String() -} diff --git a/internal/lm/imex_test.go b/internal/lm/imex_test.go index 553976465..8b7deb911 100644 --- a/internal/lm/imex_test.go +++ b/internal/lm/imex_test.go @@ -15,43 +15,3 @@ **/ package lm - -import ( - "strings" - "testing" - - "github.com/stretchr/testify/require" -) - -func TestGerenerateDomainUUID(t *testing.T) { - testCases := []struct { - description string - ips []string - expected string - }{ - { - description: "single IP", - ips: []string{"10.130.3.24"}, - expected: "60ad7226-0130-54d0-b762-2a5385a3a26f", - }, - { - description: "multiple IPs", - ips: []string{ - "10.130.3.24", - "10.130.3.53", - "10.130.3.23", - "10.130.3.31", - "10.130.3.27", - "10.130.3.25", - }, - expected: "8a7363e9-1003-5814-9354-175fdff19204", - }, - } - - for _, tc := range testCases { - t.Run(tc.description, func(t *testing.T) { - id := generateContentUUID(strings.Join(tc.ips, "\n")) - require.Equal(t, tc.expected, id) - }) - } -}