Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 0 additions & 26 deletions deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,32 +59,6 @@ spec:
shareProcessNamespace: true
{{- end }}
initContainers:
- image: {{ include "nvidia-device-plugin.fullimage" . }}
name: gpu-feature-discovery-imex-init
command: ["/bin/bash", "-c"]
args:
- |
IMEX_NODES_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg
if [[ -f /config/${IMEX_NODES_CONFIG_FILE} ]]; then
echo "Removing cached IMEX nodes config"
rm -f /config/${IMEX_NODES_CONFIG_FILE}
fi

if [[ ! -f /driver-root/${IMEX_NODES_CONFIG_FILE} ]]; then
echo "No IMEX nodes config path detected; Skipping"
exit 0
fi

echo "Copying IMEX nodes config"
mkdir -p $(dirname /config/${IMEX_NODES_CONFIG_FILE})
cp /driver-root/${IMEX_NODES_CONFIG_FILE} /config/${IMEX_NODES_CONFIG_FILE}
volumeMounts:
- name: config
mountPath: /config
- name: driver-root
mountPath: /driver-root/etc
subPath: etc
readOnly: true
{{- if $options.hasConfigMap }}
- image: {{ include "nvidia-device-plugin.fullimage" . }}
name: gpu-feature-discovery-init
Expand Down
1 change: 0 additions & 1 deletion docs/gpu-feature-discovery/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,6 @@ For a similar list of labels generated or used by the device plugin, see [here](
| nvidia.com/gpu.replicas | String | Number of GPU replicas available. Will be equal to the number of physical GPUs unless some sharing strategy is employed in which case the GPU count will be multiplied by replicas. | 4 |
| nvidia.com/gpu.mode | String | Mode of the GPU. Can be either "compute" or "display". Details of the GPU modes can be found [here](https://docs.nvidia.com/grid/13.0/grid-gpumodeswitch-user-guide/index.html#compute-and-graphics-mode) | compute |
| nvidia.com/gpu.clique | String | GPUFabric ClusterUUID + CliqueID | 7b968a6d-c8aa-45e1-9e07-e1e51be99c31.1 |
| nvidia.com/gpu.imex-domain | String | IMEX domain Ip list(Hashed) + CliqueID | 79b326e7-d566-3483-c2a3-9b38fa5cb1c8.1 |

Depending on the MIG strategy used, the following set of labels may also be
available (or override the default values for some of the labels listed above):
Expand Down
103 changes: 2 additions & 101 deletions internal/lm/imex.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,92 +17,26 @@
package lm

import (
"bufio"
"errors"
"fmt"
"io"
"net"
"os"
"path/filepath"
"sort"
"strings"

"github.com/google/uuid"
"k8s.io/klog/v2"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
)

const (
// ImexNodesConfigFilePath is the path to the IMEX nodes config file.
// This file contains a list of IP addresses of the nodes in the IMEX domain.
ImexNodesConfigFilePath = "/etc/nvidia-imex/nodes_config.cfg"
)

func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, error) {
var errs error
for _, root := range imexNodesConfigFilePathSearchRoots(config) {
configFilePath := filepath.Join(root, ImexNodesConfigFilePath)
imexLabeler, err := imexLabelerForConfigFile(configFilePath, devices)
if err != nil {
errs = errors.Join(errs, err)
continue
}
if imexLabeler != nil {
klog.Infof("Using labeler for IMEX config %v", configFilePath)
return imexLabeler, nil
}
}
if errs != nil {
return nil, errs
}

return empty{}, nil
}

// imexNodesConfigFilePathSearchRoots returns a list of roots to search for the IMEX nodes config file.
func imexNodesConfigFilePathSearchRoots(config *spec.Config) []string {
// By default, search / and /config for config files.
roots := []string{"/", "/config"}

if config == nil || config.Flags.Plugin == nil || config.Flags.Plugin.ContainerDriverRoot == nil {
return roots
}

// If a driver root is specified, it is also searched.
return append(roots, *config.Flags.Plugin.ContainerDriverRoot)
}

func imexLabelerForConfigFile(configFilePath string, devices []resource.Device) (Labeler, error) {
imexConfigFile, err := os.Open(configFilePath)
if os.IsNotExist(err) {
// No imex config file, return empty labels
return nil, nil
} else if err != nil {
return nil, fmt.Errorf("failed to open imex config file: %v", err)
}
defer imexConfigFile.Close()

clusterUUID, cliqueID, err := getFabricIDs(devices)
if err != nil {
return nil, err
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return nil, err
return empty{}, nil

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. Here we are returning an actual error. Do you have a motivation as to why we want to ignore it?

}
if clusterUUID == "" || cliqueID == "" {
return nil, nil
}

imexDomainID, err := getImexDomainID(imexConfigFile)
if err != nil {
return nil, err
}
if imexDomainID == "" {
return nil, nil
return empty{}, nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why empty{} instead of nil?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In order to avoid explicit nil checks at the callsite, we have an empty{} Labeler which returns no labels.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. Thanks.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we do the same for line (89/32)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we do the same for line (89/32)

No. On that line we're returning an actual error and we want to return nil as the labeler.

}

labels := Labels{
"nvidia.com/gpu.clique": strings.Join([]string{clusterUUID, cliqueID}, "."),
"nvidia.com/gpu.imex-domain": strings.Join([]string{imexDomainID, cliqueID}, "."),
"nvidia.com/gpu.clique": strings.Join([]string{clusterUUID, cliqueID}, "."),
}

return labels, nil
Expand Down Expand Up @@ -147,36 +81,3 @@ func getFabricIDs(devices []resource.Device) (string, string, error) {
}
return "", "", nil
}

// getImexDomainID reads the imex config file and returns a unique identifier
// based on the sorted list of IP addresses in the file.
func getImexDomainID(r io.Reader) (string, error) {
// Read the file line by line
var ips []string
scanner := bufio.NewScanner(r)
for scanner.Scan() {
ip := strings.TrimSpace(scanner.Text())
if net.ParseIP(ip) == nil {
return "", fmt.Errorf("invalid IP address in imex config file: %s", ip)
}
ips = append(ips, ip)
}

if err := scanner.Err(); err != nil {
return "", fmt.Errorf("failed to read imex config file: %v", err)
}

if len(ips) == 0 {
// No IPs in the file, return empty labels
return "", nil
}

sort.Strings(ips)

return generateContentUUID(strings.Join(ips, "\n")), nil

}

func generateContentUUID(seed string) string {
return uuid.NewSHA1(uuid.Nil, []byte(seed)).String()
}
40 changes: 0 additions & 40 deletions internal/lm/imex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,43 +15,3 @@
**/

package lm

import (
"strings"
"testing"

"github.com/stretchr/testify/require"
)

func TestGerenerateDomainUUID(t *testing.T) {
testCases := []struct {
description string
ips []string
expected string
}{
{
description: "single IP",
ips: []string{"10.130.3.24"},
expected: "60ad7226-0130-54d0-b762-2a5385a3a26f",
},
{
description: "multiple IPs",
ips: []string{
"10.130.3.24",
"10.130.3.53",
"10.130.3.23",
"10.130.3.31",
"10.130.3.27",
"10.130.3.25",
},
expected: "8a7363e9-1003-5814-9354-175fdff19204",
},
}

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
id := generateContentUUID(strings.Join(tc.ips, "\n"))
require.Equal(t, tc.expected, id)
})
}
}