$ lspci -nnk | grep -i nvidia
00:05.0 3D controller [0302]: NVIDIA Corporation Device [10de:20b7] (rev a1)
Subsystem: NVIDIA Corporation Device [10de:1532]
Kernel modules: nvidiafb
$ bin/cubectl addon list
┌────────────────┬─────────┬──────────┬─────────┬─────────────────────────────┐
│ ADDON NAME │ VERSION │ STATUS │ PROFILE │ VALUES PATH │
├────────────────┼─────────┼──────────┼─────────┼─────────────────────────────┤
│ csi-driver-nfs │ v4.8.0 │ disabled │ │ csi-driver-nfs/default.yaml │
│ gpu-operator │ v23.9.0 │ disabled │ │ gpu-operator/default.yaml │
│ │ │ │ redhat │ gpu-operator/redhat.yaml │
│ │ │ │ ubuntu │ gpu-operator/ubuntu.yaml │
│ kore-board │ 0.5.5 │ disabled │ │ kore-board/default.yaml │
└────────────────┴─────────┴──────────┴─────────┴─────────────────────────────┘
Duration 73.639078ms time
$ vi ${CUBE_HOME}/extends/addon/profile/gpu-operator/default.yaml
$ bin/cubectl addon enable gpu-operator
addon enable start: gpu-operator ...
addon enable complete: gpu-operator
Duration 52.093538621s time
$ bin/cubectl addon list
┌────────────────┬─────────┬────────────┬─────────┬─────────────────────────────┐
│ ADDON NAME │ VERSION │ STATUS │ PROFILE │ VALUES PATH │
├────────────────┼─────────┼────────────┼─────────┼─────────────────────────────┤
│ csi-driver-nfs │ v4.8.0 │ disabled │ │ csi-driver-nfs/default.yaml │
│ gpu-operator │ v23.9.0 │ enabled ✅ │ │ gpu-operator/default.yaml │
│ │ │ │ redhat │ gpu-operator/redhat.yaml │
│ │ │ │ ubuntu │ gpu-operator/ubuntu.yaml │
│ kore-board │ 0.5.5 │ disabled │ │ kore-board/default.yaml │
└────────────────┴─────────┴────────────┴─────────┴─────────────────────────────┘
Duration 75.061448ms time
$ kubectl get pods -A -o wide --field-selector metadata.namespace=gpu-operator
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
gpu-operator gpu-operator-5564789746-rlpzk 1/1 Running 0 65s 10.4.185.65 cp-node-1 <none> <none>
gpu-operator gpu-operator-node-feature-discovery-gc-78b479ccc6-ngfnd 1/1 Running 0 65s 10.4.211.67 wk-node-1 <none> <none>
gpu-operator gpu-operator-node-feature-discovery-master-569bfcd8bc-5xb8h 1/1 Running 0 65s 10.4.111.193 cp-node-3 <none> <none>
gpu-operator gpu-operator-node-feature-discovery-worker-dlxxh 1/1 Running 0 65s 10.4.111.194 cp-node-3 <none> <none>
gpu-operator gpu-operator-node-feature-discovery-worker-fmlmb 1/1 Running 0 65s 10.4.185.66 cp-node-1 <none> <none>
gpu-operator gpu-operator-node-feature-discovery-worker-gqn8z 1/1 Running 0 65s 10.4.238.68 cp-node-2 <none> <none>
gpu-operator gpu-operator-node-feature-discovery-worker-pksh4 1/1 Running 0 65s 10.4.109.2 wk-node-2 <none> <none>
gpu-operator gpu-operator-node-feature-discovery-worker-xx6gb 1/1 Running 0 65s 10.4.211.66 wk-node-1 <none> <none>
$ bin/cubectl addon disable gpu-operator
MIG(Multi-Instance GPU) 설정 방법
# 1g.6gb 인 경우 (A30 기준. A100 40GB 는 all-1g.5gb 와 같이 GPU 모델에 맞는 프로파일을 사용)
$ kubectl label nodes $NODE nvidia.com/mig.config=all-1g.6gb --overwrite
$ kubectl -n gpu-operator exec -it nvidia-dcgm-exporter-gc6bm -- bash
root@nvidia-dcgm-exporter-gc6bm:/# nvidia-smi
Thu Dec 7 06:02:55 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12 Driver Version: 535.104.12 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A30 On | 00000000:00:05.0 Off | On |
| N/A 63C P0 72W / 165W | N/A | N/A Default |
| | | Enabled |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| MIG devices: |
+------------------+--------------------------------+-----------+-----------------------+
| GPU GI CI MIG | Memory-Usage | Vol| Shared |
| ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG |
| | | ECC| |
|==================+================================+===========+=======================|
| 0 3 0 0 | 12MiB / 5952MiB | 14 0 | 1 0 1 0 0 |
| | 0MiB / 8191MiB | | |
+------------------+--------------------------------+-----------+-----------------------+
| 0 4 0 1 | 12MiB / 5952MiB | 14 0 | 1 0 1 0 0 |
| | 0MiB / 8191MiB | | |
+------------------+--------------------------------+-----------+-----------------------+
| 0 5 0 2 | 12MiB / 5952MiB | 14 0 | 1 0 1 0 0 |
| | 0MiB / 8191MiB | | |
+------------------+--------------------------------+-----------+-----------------------+
| 0 6 0 3 | 12MiB / 5952MiB | 14 0 | 1 0 1 0 0 |
| | 0MiB / 8191MiB | | |
+------------------+--------------------------------+-----------+-----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
root@nvidia-dcgm-exporter-gc6bm:/# nvidia-smi -L
GPU 0: NVIDIA A30 (UUID: GPU-79e36614-3f62-d3dd-cdd0-48b00aa446e0)
MIG 1g.6gb Device 0: (UUID: MIG-39f52290-ccf4-5e32-b8b8-cc1877a32051)
MIG 1g.6gb Device 1: (UUID: MIG-dbf3834e-128b-5965-88b7-2e3d2fe5a0aa)
MIG 1g.6gb Device 2: (UUID: MIG-c735f798-c9d5-5c0e-972c-e0bc6cdb05e7)
MIG 1g.6gb Device 3: (UUID: MIG-233d355f-f84e-530d-8526-797b5a867669)
MIG를 지원하지 않는 장비에서 하나의 GPU를 분할하여 사용하는 방법 (Time-Slicing)
# 4개로 분할하는 경우
$ cat <<EOF > time-slicing-config-all.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: time-slicing-config-all
data:
  any: |-
    version: v1
    flags:
      migStrategy: none
    sharing:
      timeSlicing:
        resources:
        - name: nvidia.com/gpu
          replicas: 4
EOF
$ kubectl apply -n gpu-operator -f time-slicing-config-all.yaml
$ kubectl patch clusterpolicy/cluster-policy -n gpu-operator --type merge -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config-all", "default": "any"}}}}'
$ kubectl describe node "$GPU_NODE"