Back to Llama Cpp

Dgx Spark

benches/dgx-spark/dgx-spark.md

latest · 25.6 KB
Original Source

System info

bash
uname --all
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux

g++ --version
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0

nvidia-smi
Thu Feb  5 13:49:40 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
| N/A   47C    P0             13W /  N/A  | Not Supported          |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

ggml-org/gpt-oss-20b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|----|----|---|------|--------|----------|--------|----------|-----|-------|
| 512 | 32 | 1 | 544 | 0.270 | 1895.57 | 0.399 | 80.13 | 0.669 | 812.60 |
| 512 | 32 | 2 | 1088 | 0.230 | 4451.23 | 0.583 | 109.71 | 0.813 | 1337.56 |
| 512 | 32 | 4 | 2176 | 0.437 | 4688.87 | 0.820 | 156.03 | 1.257 | 1730.91 |
| 512 | 32 | 8 | 4352 | 0.863 | 4744.23 | 0.942 | 271.79 | 1.805 | 2410.73 |
| 512 | 32 | 16 | 8704 | 1.725 | 4748.19 | 1.173 | 436.38 | 2.899 | 3002.85 |
| 512 | 32 | 32 | 17408 | 3.437 | 4767.38 | 1.503 | 681.49 | 4.939 | 3524.40 |
| 4096 | 32 | 1 | 4128 | 0.907 | 4513.91 | 0.407 | 78.54 | 1.315 | 3139.56 |
| 4096 | 32 | 2 | 8256 | 1.796 | 4560.42 | 0.625 | 102.37 | 2.422 | 3409.45 |
| 4096 | 32 | 4 | 16512 | 3.596 | 4555.66 | 0.888 | 144.11 | 4.485 | 3681.93 |
| 4096 | 32 | 8 | 33024 | 7.184 | 4561.44 | 1.098 | 233.11 | 8.282 | 3987.51 |
| 4096 | 32 | 16 | 66048 | 14.369 | 4560.82 | 1.503 | 340.74 | 15.872 | 4161.30 |
| 4096 | 32 | 32 | 132096 | 28.760 | 4557.52 | 2.162 | 473.59 | 30.922 | 4271.95 |
| 8192 | 32 | 1 | 8224 | 1.859 | 4405.59 | 0.430 | 74.36 | 2.290 | 3591.61 |
| 8192 | 32 | 2 | 16448 | 3.698 | 4430.02 | 0.656 | 97.59 | 4.354 | 3777.47 |
| 8192 | 32 | 4 | 32896 | 7.403 | 4426.10 | 0.957 | 133.82 | 8.360 | 3934.97 |
| 8192 | 32 | 8 | 65792 | 14.802 | 4427.63 | 1.222 | 209.44 | 16.024 | 4105.87 |
| 8192 | 32 | 16 | 131584 | 29.596 | 4428.67 | 1.741 | 294.13 | 31.337 | 4199.00 |
| 8192 | 32 | 32 | 263168 | 59.169 | 4430.42 | 2.619 | 390.92 | 61.789 | 4259.17 |
  • llama-bench
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|-------|------|--------|---------|-----|----------|----|------|-----|------|-----|
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 4505.82 ± 12.90 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 83.43 ± 0.59 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 4158.34 ± 18.84 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 79.22 ± 0.60 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 3993.81 ± 17.55 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 75.22 ± 1.05 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 3449.98 ± 12.13 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.36 ± 0.37 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 2689.42 ± 18.89 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 61.65 ± 0.30 |

build: 11fb327bf (7941)

ggml-org/gpt-oss-120b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|----|----|---|------|--------|----------|--------|----------|-----|-------|
| 512 | 32 | 1 | 544 | 0.445 | 1151.80 | 0.560 | 57.14 | 1.005 | 541.53 |
| 512 | 32 | 2 | 1088 | 0.472 | 2169.85 | 0.874 | 73.27 | 1.345 | 808.65 |
| 512 | 32 | 4 | 2176 | 0.826 | 2480.33 | 1.299 | 98.51 | 2.125 | 1023.94 |
| 512 | 32 | 8 | 4352 | 1.644 | 2491.67 | 1.608 | 159.18 | 3.252 | 1338.20 |
| 512 | 32 | 16 | 8704 | 3.292 | 2488.35 | 2.117 | 241.85 | 5.409 | 1609.13 |
| 512 | 32 | 32 | 17408 | 6.604 | 2481.07 | 2.898 | 353.31 | 9.502 | 1832.04 |
| 4096 | 32 | 1 | 4128 | 1.698 | 2412.65 | 0.580 | 55.21 | 2.277 | 1812.66 |
| 4096 | 32 | 2 | 8256 | 3.399 | 2409.88 | 0.934 | 68.53 | 4.333 | 1905.27 |
| 4096 | 32 | 4 | 16512 | 6.823 | 2401.21 | 1.411 | 90.72 | 8.234 | 2005.30 |
| 4096 | 32 | 8 | 33024 | 13.574 | 2413.97 | 1.841 | 139.07 | 15.415 | 2142.31 |
| 4096 | 32 | 16 | 66048 | 27.176 | 2411.52 | 2.609 | 196.26 | 29.785 | 2217.49 |
| 4096 | 32 | 32 | 132096 | 54.359 | 2411.23 | 3.905 | 262.20 | 58.264 | 2267.19 |
| 8192 | 32 | 1 | 8224 | 3.491 | 2346.81 | 0.613 | 52.23 | 4.103 | 2004.21 |
| 8192 | 32 | 2 | 16448 | 6.939 | 2361.03 | 0.981 | 65.21 | 7.921 | 2076.56 |
| 8192 | 32 | 4 | 32896 | 13.888 | 2359.40 | 1.511 | 84.71 | 15.399 | 2136.21 |
| 8192 | 32 | 8 | 65792 | 27.756 | 2361.18 | 2.034 | 125.86 | 29.790 | 2208.56 |
| 8192 | 32 | 16 | 131584 | 55.554 | 2359.34 | 3.021 | 169.49 | 58.575 | 2246.41 |
| 8192 | 32 | 32 | 263168 | 111.036 | 2360.89 | 4.537 | 225.72 | 115.573 | 2277.08 |
  • llama-bench
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|-------|------|--------|---------|-----|----------|----|------|-----|------|-----|
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2443.91 ± 7.47 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 58.72 ± 0.20 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2309.84 ± 3.63 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 55.67 ± 0.35 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2216.68 ± 10.16 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 52.87 ± 0.43 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1956.31 ± 6.39 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 49.45 ± 0.20 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1567.08 ± 11.79 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 42.76 ± 0.14 |

build: 11fb327bf (7941)

ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|----|----|---|------|--------|----------|--------|----------|-----|-------|
| 512 | 32 | 1 | 544 | 0.393 | 1303.73 | 0.548 | 58.36 | 0.941 | 578.10 |
| 512 | 32 | 2 | 1088 | 0.387 | 2648.68 | 0.910 | 70.35 | 1.296 | 839.27 |
| 512 | 32 | 4 | 2176 | 0.659 | 3107.63 | 1.302 | 98.33 | 1.961 | 1109.77 |
| 512 | 32 | 8 | 4352 | 1.322 | 3099.35 | 1.669 | 153.42 | 2.990 | 1455.43 |
| 512 | 32 | 16 | 8704 | 2.639 | 3104.63 | 2.212 | 231.44 | 4.851 | 1794.32 |
| 512 | 32 | 32 | 17408 | 5.284 | 3100.80 | 2.955 | 346.53 | 8.239 | 2112.93 |
| 4096 | 32 | 1 | 4128 | 1.417 | 2890.36 | 0.598 | 53.51 | 2.015 | 2048.45 |
| 4096 | 32 | 2 | 8256 | 2.829 | 2895.62 | 1.019 | 62.82 | 3.848 | 2145.60 |
| 4096 | 32 | 4 | 16512 | 5.656 | 2896.96 | 1.528 | 83.79 | 7.183 | 2298.71 |
| 4096 | 32 | 8 | 33024 | 11.338 | 2890.02 | 2.127 | 120.36 | 13.465 | 2452.53 |
| 4096 | 32 | 16 | 66048 | 22.709 | 2885.96 | 3.104 | 164.97 | 25.812 | 2558.79 |
| 4096 | 32 | 32 | 132096 | 45.301 | 2893.35 | 4.723 | 216.80 | 50.024 | 2640.63 |
| 8192 | 32 | 1 | 8224 | 3.022 | 2711.09 | 0.678 | 47.20 | 3.700 | 2222.89 |
| 8192 | 32 | 2 | 16448 | 6.039 | 2713.01 | 1.149 | 55.70 | 7.188 | 2288.21 |
| 8192 | 32 | 4 | 32896 | 12.050 | 2719.35 | 1.785 | 71.69 | 13.835 | 2377.67 |
| 8192 | 32 | 8 | 65792 | 24.113 | 2717.90 | 2.629 | 97.39 | 26.741 | 2460.31 |
| 8192 | 32 | 16 | 131584 | 48.178 | 2720.58 | 4.099 | 124.91 | 52.277 | 2517.06 |
| 8192 | 32 | 32 | 263168 | 96.401 | 2719.31 | 6.696 | 152.93 | 103.097 | 2552.63 |
  • llama-bench
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|-------|------|--------|---------|-----|----------|----|------|-----|------|-----|
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2986.97 ± 18.87 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 61.06 ± 0.23 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2633.45 ± 6.26 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 54.77 ± 0.28 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2354.14 ± 3.84 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 48.02 ± 0.40 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1908.86 ± 4.25 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 40.23 ± 0.10 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1348.17 ± 2.00 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 30.21 ± 0.04 |

build: 11fb327bf (7941)

ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|----|----|---|------|--------|----------|--------|----------|-----|-------|
| 512 | 32 | 1 | 544 | 0.212 | 2420.12 | 1.100 | 29.10 | 1.311 | 414.85 |
| 512 | 32 | 2 | 1088 | 0.428 | 2393.89 | 1.185 | 54.00 | 1.613 | 674.56 |
| 512 | 32 | 4 | 2176 | 0.894 | 2290.41 | 1.229 | 104.17 | 2.123 | 1025.02 |
| 512 | 32 | 8 | 4352 | 1.758 | 2330.36 | 1.319 | 194.15 | 3.076 | 1414.70 |
| 512 | 32 | 16 | 8704 | 3.508 | 2335.21 | 1.543 | 331.90 | 5.051 | 1723.33 |
| 512 | 32 | 32 | 17408 | 7.035 | 2328.93 | 1.738 | 589.21 | 8.773 | 1984.29 |
| 4096 | 32 | 1 | 4128 | 1.831 | 2237.25 | 1.125 | 28.44 | 2.956 | 1396.42 |
| 4096 | 32 | 2 | 8256 | 3.642 | 2249.48 | 1.253 | 51.07 | 4.895 | 1686.64 |
| 4096 | 32 | 4 | 16512 | 7.274 | 2252.26 | 1.380 | 92.72 | 8.655 | 1907.81 |
| 4096 | 32 | 8 | 33024 | 14.576 | 2248.09 | 1.617 | 158.29 | 16.193 | 2039.37 |
| 4096 | 32 | 16 | 66048 | 29.138 | 2249.17 | 2.081 | 246.01 | 31.219 | 2115.63 |
| 4096 | 32 | 32 | 132096 | 58.275 | 2249.19 | 2.814 | 363.87 | 61.089 | 2162.34 |
| 8192 | 32 | 1 | 8224 | 3.757 | 2180.26 | 1.184 | 27.03 | 4.941 | 1664.37 |
| 8192 | 32 | 2 | 16448 | 7.522 | 2178.05 | 1.341 | 47.73 | 8.863 | 1855.77 |
| 8192 | 32 | 4 | 32896 | 15.043 | 2178.25 | 1.548 | 82.69 | 16.591 | 1982.74 |
| 8192 | 32 | 8 | 65792 | 30.111 | 2176.49 | 1.937 | 132.13 | 32.048 | 2052.90 |
| 8192 | 32 | 16 | 131584 | 60.405 | 2169.90 | 2.706 | 189.21 | 63.111 | 2084.97 |
| 8192 | 32 | 32 | 263168 | 120.439 | 2176.58 | 3.993 | 256.46 | 124.432 | 2114.96 |
  • llama-bench
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|-------|------|--------|---------|-----|----------|----|------|-----|------|-----|
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2250.28 ± 6.41 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 29.43 ± 0.02 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2100.19 ± 8.96 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 28.61 ± 0.02 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2007.56 ± 4.16 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 27.38 ± 0.09 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1779.11 ± 6.42 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 25.72 ± 0.03 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1471.23 ± 1.71 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 22.51 ± 0.02 |

build: 11fb327bf (7941)

ggml-org/gemma-3-4b-it-qat-GGUF

Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|----|----|---|------|--------|----------|--------|----------|-----|-------|
| 512 | 32 | 1 | 544 | 0.092 | 5566.97 | 0.412 | 77.63 | 0.504 | 1078.95 |
| 512 | 32 | 2 | 1088 | 0.161 | 6345.67 | 0.522 | 122.70 | 0.683 | 1593.06 |
| 512 | 32 | 4 | 2176 | 0.325 | 6309.87 | 0.562 | 227.68 | 0.887 | 2453.87 |
| 512 | 32 | 8 | 4352 | 0.643 | 6374.42 | 0.685 | 373.67 | 1.328 | 3277.94 |
| 512 | 32 | 16 | 8704 | 1.277 | 6413.64 | 0.915 | 559.47 | 2.192 | 3970.01 |
| 512 | 32 | 32 | 17408 | 2.518 | 6506.57 | 1.249 | 819.61 | 3.767 | 4620.64 |
| 4096 | 32 | 1 | 4128 | 0.674 | 6079.68 | 0.453 | 70.60 | 1.127 | 3662.88 |
| 4096 | 32 | 2 | 8256 | 1.335 | 6137.82 | 0.627 | 102.03 | 1.962 | 4208.11 |
| 4096 | 32 | 4 | 16512 | 2.657 | 6167.35 | 0.749 | 170.92 | 3.405 | 4848.71 |
| 4096 | 32 | 8 | 33024 | 5.307 | 6173.91 | 0.974 | 262.89 | 6.281 | 5257.53 |
| 4096 | 32 | 16 | 66048 | 10.610 | 6176.96 | 1.379 | 371.42 | 11.988 | 5509.40 |
| 4096 | 32 | 32 | 132096 | 21.213 | 6178.89 | 2.122 | 482.50 | 23.335 | 5660.82 |
| 8192 | 32 | 1 | 8224 | 1.359 | 6027.34 | 0.467 | 68.52 | 1.826 | 4503.48 |
| 8192 | 32 | 2 | 16448 | 2.699 | 6069.68 | 0.653 | 98.03 | 3.352 | 4906.68 |
| 8192 | 32 | 4 | 32896 | 5.366 | 6106.74 | 0.818 | 156.55 | 6.184 | 5319.96 |
| 8192 | 32 | 8 | 65792 | 10.755 | 6093.50 | 1.174 | 218.04 | 11.929 | 5515.22 |
| 8192 | 32 | 16 | 131584 | 21.484 | 6100.82 | 1.829 | 279.90 | 23.314 | 5644.11 |
| 8192 | 32 | 32 | 263168 | 42.950 | 6103.40 | 3.058 | 334.91 | 46.008 | 5720.05 |
  • llama-bench
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|-------|------|--------|---------|-----|----------|----|------|-----|------|-----|
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 5948.74 ± 10.61 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 81.05 ± 0.20 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 5652.69 ± 34.29 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 76.37 ± 0.58 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 5509.57 ± 40.69 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 71.61 ± 0.80 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 5340.86 ± 36.92 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.89 ± 0.34 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 5023.30 ± 13.52 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 62.28 ± 0.30 |

build: 11fb327bf (7941)

ggml-org/GLM-4.7-Flash-GGUF

Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|----|----|---|------|--------|----------|--------|----------|-----|-------|
| 512 | 32 | 1 | 544 | 0.433 | 1181.83 | 0.693 | 46.16 | 1.126 | 482.94 |
| 512 | 32 | 2 | 1088 | 0.439 | 2334.46 | 1.034 | 61.89 | 1.473 | 738.75 |
| 512 | 32 | 4 | 2176 | 0.772 | 2654.46 | 1.459 | 87.76 | 2.230 | 975.77 |
| 512 | 32 | 8 | 4352 | 1.541 | 2658.78 | 2.043 | 125.31 | 3.583 | 1214.47 |
| 512 | 32 | 16 | 8704 | 3.083 | 2656.91 | 2.675 | 191.42 | 5.758 | 1511.62 |
| 512 | 32 | 32 | 17408 | 6.159 | 2660.12 | 3.615 | 283.24 | 9.774 | 1780.98 |
| 4096 | 32 | 1 | 4128 | 1.915 | 2139.30 | 0.725 | 44.14 | 2.640 | 1563.83 |
| 4096 | 32 | 2 | 8256 | 3.834 | 2136.40 | 1.119 | 57.21 | 4.953 | 1666.81 |
| 4096 | 32 | 4 | 16512 | 7.636 | 2145.72 | 1.631 | 78.49 | 9.266 | 1781.93 |
| 4096 | 32 | 8 | 33024 | 15.295 | 2142.40 | 2.344 | 109.21 | 17.639 | 1872.20 |
| 4096 | 32 | 16 | 66048 | 30.573 | 2143.62 | 3.773 | 135.70 | 34.346 | 1923.04 |
| 4096 | 32 | 32 | 132096 | 61.282 | 2138.82 | 5.795 | 176.71 | 67.077 | 1969.31 |
| 8192 | 32 | 1 | 8224 | 4.510 | 1816.24 | 0.760 | 42.11 | 5.270 | 1560.44 |
| 8192 | 32 | 2 | 16448 | 9.036 | 1813.19 | 1.206 | 53.06 | 10.242 | 1605.91 |
| 8192 | 32 | 4 | 32896 | 18.070 | 1813.43 | 1.783 | 71.80 | 19.852 | 1657.03 |
| 8192 | 32 | 8 | 65792 | 36.125 | 1814.15 | 2.635 | 97.14 | 38.760 | 1697.41 |
| 8192 | 32 | 16 | 131584 | 72.367 | 1811.20 | 4.954 | 103.34 | 77.322 | 1701.77 |
| 8192 | 32 | 32 | 263168 | 144.501 | 1814.13 | 8.103 | 126.37 | 152.604 | 1724.51 |
  • llama-bench
| model | size | params | backend | ngl | n_ubatch | fa | dio | test | t/s |
|-------|------|--------|---------|-----|----------|----|-----|------|-----|
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 | 2364.18 ± 11.43 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 | 48.68 ± 0.12 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d4096 | 1684.13 ± 1.24 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d4096 | 44.62 ± 0.22 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d8192 | 1314.68 ± 1.41 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d8192 | 42.59 ± 0.11 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d16384 | 914.05 ± 3.32 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d16384 | 38.72 ± 0.13 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d32768 | 567.20 ± 0.90 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d32768 | 32.65 ± 0.09 |

build: 11fb327bf (7941)