Back to Llama Cpp

NVIDIA DGX Spark

benches/nemotron/nemotron-dgx-spark.md

latest · 8.8 KB
Original Source

NVIDIA DGX Spark

System info

bash
uname --all
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux

g++ --version
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0

nvidia-smi
Fri Mar  6 11:39:45 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
| N/A   52C    P0             13W /  N/A  | Not Supported          |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

ggml-org/Nemotron-3-Super-120B-GGUF

Model: https://huggingface.co/ggml-org/Nemotron-3-Super-120B-GGUF

  • llama-batched-bench

main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---|---|---|---|---|---|---|---|---|---|
| 512 | 32 | 1 | 544 | 1.094 | 468.05 | 1.621 | 19.74 | 2.715 | 200.37 |
| 512 | 32 | 2 | 1088 | 1.463 | 700.16 | 2.437 | 26.26 | 3.900 | 279.01 |
| 512 | 32 | 4 | 2176 | 2.647 | 773.76 | 4.043 | 31.66 | 6.689 | 325.29 |
| 512 | 32 | 8 | 4352 | 5.291 | 774.14 | 6.151 | 41.62 | 11.442 | 380.37 |
| 512 | 32 | 16 | 8704 | 10.603 | 772.62 | 10.385 | 49.30 | 20.987 | 414.72 |
| 512 | 32 | 32 | 17408 | 21.231 | 771.69 | 18.235 | 56.16 | 39.466 | 441.09 |
| 4096 | 32 | 1 | 4128 | 5.340 | 767.05 | 1.616 | 19.81 | 6.956 | 593.47 |
| 4096 | 32 | 2 | 8256 | 10.673 | 767.55 | 2.454 | 26.08 | 13.127 | 628.94 |
| 4096 | 32 | 4 | 16512 | 21.348 | 767.46 | 4.072 | 31.44 | 25.420 | 649.57 |
| 4096 | 32 | 8 | 33024 | 42.714 | 767.15 | 6.277 | 40.78 | 48.991 | 674.08 |
| 4096 | 32 | 16 | 66048 | 85.385 | 767.54 | 10.596 | 48.32 | 95.981 | 688.14 |
| 4096 | 32 | 32 | 132096 | 170.819 | 767.32 | 18.619 | 55.00 | 189.437 | 697.31 |
| 8192 | 32 | 1 | 8224 | 10.690 | 766.32 | 1.619 | 19.76 | 12.310 | 668.10 |
| 8192 | 32 | 2 | 16448 | 21.382 | 766.24 | 2.467 | 25.94 | 23.850 | 689.65 |
| 8192 | 32 | 4 | 32896 | 42.782 | 765.92 | 4.098 | 31.23 | 46.881 | 701.69 |
| 8192 | 32 | 8 | 65792 | 85.582 | 765.77 | 6.368 | 40.20 | 91.951 | 715.52 |
| 8192 | 32 | 16 | 131584 | 171.066 | 766.21 | 10.774 | 47.52 | 181.840 | 723.62 |
| 8192 | 32 | 32 | 263168 | 342.140 | 766.19 | 18.969 | 53.98 | 361.109 | 728.78 |
  • llama-bench
| model | size | params | backend | n_ubatch | fa | test | t/s |
|---|---|---|---|---|---|---|---|
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 | 768.84 ± 0.90 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 | 19.94 ± 0.16 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d4096 | 764.51 ± 0.50 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d4096 | 19.95 ± 0.18 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d8192 | 759.53 ± 0.71 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d8192 | 19.83 ± 0.18 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d16384 | 747.98 ± 1.58 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d16384 | 19.84 ± 0.18 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d32768 | 724.40 ± 2.70 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d32768 | 19.45 ± 0.18 |

build: 04a65daab (8268)

ggml-org/Nemotron-3-Nano-4B-GGUF

Model: https://huggingface.co/ggml-org/Nemotron-3-Nano-4B-GGUF

  • llama-batched-bench

main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---|---|---|---|---|---|---|---|---|---|
| 512 | 32 | 1 | 544 | 0.152 | 3371.61 | 0.597 | 53.64 | 0.748 | 726.90 |
| 512 | 32 | 2 | 1088 | 0.319 | 3208.68 | 0.857 | 74.66 | 1.176 | 924.89 |
| 512 | 32 | 4 | 2176 | 0.720 | 2843.56 | 1.323 | 96.78 | 2.043 | 1065.18 |
| 512 | 32 | 8 | 4352 | 1.428 | 2867.96 | 2.311 | 110.76 | 3.739 | 1163.82 |
| 512 | 32 | 16 | 8704 | 2.857 | 2866.94 | 4.203 | 121.82 | 7.060 | 1232.82 |
| 512 | 32 | 32 | 17408 | 5.709 | 2869.76 | 7.964 | 128.58 | 13.673 | 1273.14 |
| 4096 | 32 | 1 | 4128 | 1.458 | 2809.76 | 0.605 | 52.92 | 2.062 | 2001.52 |
| 4096 | 32 | 2 | 8256 | 2.905 | 2819.95 | 0.875 | 73.12 | 3.780 | 2183.95 |
| 4096 | 32 | 4 | 16512 | 5.790 | 2829.74 | 1.361 | 94.07 | 7.151 | 2309.17 |
| 4096 | 32 | 8 | 33024 | 11.598 | 2825.32 | 2.378 | 107.65 | 13.976 | 2362.89 |
| 4096 | 32 | 16 | 66048 | 23.208 | 2823.88 | 4.348 | 117.76 | 27.556 | 2396.89 |
| 4096 | 32 | 32 | 132096 | 46.515 | 2817.85 | 8.279 | 123.69 | 54.794 | 2410.79 |
| 8192 | 32 | 1 | 8224 | 2.950 | 2776.95 | 0.617 | 51.89 | 3.567 | 2305.75 |
| 8192 | 32 | 2 | 16448 | 5.921 | 2767.32 | 0.896 | 71.45 | 6.816 | 2413.05 |
| 8192 | 32 | 4 | 32896 | 11.842 | 2767.21 | 1.401 | 91.34 | 13.243 | 2484.03 |
| 8192 | 32 | 8 | 65792 | 23.726 | 2762.17 | 2.461 | 104.03 | 26.187 | 2512.38 |
| 8192 | 32 | 16 | 131584 | 47.777 | 2743.43 | 4.577 | 111.86 | 52.354 | 2513.36 |
| 8192 | 32 | 32 | 263168 | 96.691 | 2711.16 | 8.772 | 116.73 | 105.463 | 2495.36 |
  • llama-bench
| model | size | params | backend | n_ubatch | fa | test | t/s |
|---|---|---|---|---|---|---|---|
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 | 2761.90 ± 19.31 |
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 | 52.85 ± 0.12 |
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d4096 | 2687.07 ± 21.84 |
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d4096 | 52.32 ± 0.23 |
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d8192 | 2564.52 ± 57.69 |
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d8192 | 51.27 ± 0.34 |
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d16384 | 2334.02 ± 37.83 |
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d16384 | 49.71 ± 0.14 |
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d32768 | 2041.46 ± 40.45 |
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d32768 | 46.71 ± 0.13 |

build: 1bbec6a75 (8382)