Back to Llama Cpp

Mac M2 Ultra

benches/mac-m2-ultra/mac-m2-ultra.md

latest · 24.1 KB
Original Source

System info

bash
uname -a
Darwin gg-studio 25.2.0 Darwin Kernel Version 25.2.0: Tue Nov 18 21:07:05 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6020 arm64

g++ --version
Apple clang version 17.0.0 (clang-1700.3.19.1)
Target: arm64-apple-darwin25.2.0

ggml-org/gpt-oss-20b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.215 | 2381.35 | 0.245 | 130.45 | 0.460 | 1181.81 |
| 512 | 32 | 2 | 1088 | 0.379 | 2701.43 | 0.382 | 167.56 | 0.761 | 1429.67 |
| 512 | 32 | 4 | 2176 | 0.721 | 2839.27 | 0.604 | 211.76 | 1.326 | 1641.32 |
| 512 | 32 | 8 | 4352 | 1.433 | 2858.30 | 1.033 | 247.75 | 2.466 | 1764.57 |
| 512 | 32 | 16 | 8704 | 2.853 | 2871.12 | 1.570 | 326.11 | 4.423 | 1967.77 |
| 512 | 32 | 32 | 17408 | 5.699 | 2874.95 | 1.910 | 536.15 | 7.609 | 2287.88 |
| 4096 | 32 | 1 | 4128 | 1.552 | 2638.56 | 0.334 | 95.72 | 1.887 | 2188.00 |
| 4096 | 32 | 2 | 8256 | 3.084 | 2655.88 | 0.404 | 158.54 | 3.488 | 2366.86 |
| 4096 | 32 | 4 | 16512 | 6.151 | 2663.78 | 0.652 | 196.39 | 6.802 | 2427.37 |
| 4096 | 32 | 8 | 33024 | 12.288 | 2666.77 | 1.135 | 225.47 | 13.423 | 2460.27 |
| 4096 | 32 | 16 | 66048 | 24.563 | 2668.12 | 1.762 | 290.55 | 26.325 | 2508.97 |
| 4096 | 32 | 32 | 132096 | 49.114 | 2668.73 | 2.398 | 426.94 | 51.512 | 2564.35 |
| 8192 | 32 | 1 | 8224 | 3.345 | 2448.78 | 0.275 | 116.46 | 3.620 | 2271.76 |
| 8192 | 32 | 2 | 16448 | 6.665 | 2458.11 | 0.425 | 150.71 | 7.090 | 2319.91 |
| 8192 | 32 | 4 | 32896 | 13.315 | 2460.92 | 0.691 | 185.21 | 14.006 | 2348.63 |
| 8192 | 32 | 8 | 65792 | 26.611 | 2462.73 | 1.212 | 211.16 | 27.823 | 2364.62 |
| 8192 | 32 | 16 | 131584 | 53.232 | 2462.27 | 1.919 | 266.83 | 55.151 | 2385.88 |
| 8192 | 32 | 32 | 263168 | 110.455 | 2373.30 | 2.752 | 372.03 | 113.208 | 2324.64 |
  • llama-bench
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|-------|------|--------|---------|--------:|---------:|---:|------|----:|
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2713.40 ± 3.56 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 129.97 ± 3.90 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 2324.59 ± 3.01 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 123.38 ± 0.17 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1989.82 ± 30.11 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 117.39 ± 0.33 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 1556.54 ± 6.22 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 109.75 ± 0.42 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 1122.63 ± 1.45 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 98.25 ± 0.08 |

build: b828e18c7 (7948)

ggml-org/gpt-oss-120b-GGUF

Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.426 | 1200.92 | 0.361 | 88.56 | 0.788 | 690.64 |
| 512 | 32 | 2 | 1088 | 0.683 | 1500.14 | 0.545 | 117.35 | 1.228 | 886.02 |
| 512 | 32 | 4 | 2176 | 1.204 | 1701.56 | 0.847 | 151.19 | 2.050 | 1061.34 |
| 512 | 32 | 8 | 4352 | 2.402 | 1705.20 | 1.455 | 176.00 | 3.857 | 1128.45 |
| 512 | 32 | 16 | 8704 | 4.802 | 1705.90 | 2.349 | 217.93 | 7.152 | 1217.08 |
| 512 | 32 | 32 | 17408 | 9.593 | 1707.85 | 3.665 | 279.42 | 13.258 | 1313.01 |
| 4096 | 32 | 1 | 4128 | 2.581 | 1587.08 | 0.390 | 82.12 | 2.970 | 1389.67 |
| 4096 | 32 | 2 | 8256 | 5.124 | 1598.79 | 0.589 | 108.62 | 5.713 | 1445.10 |
| 4096 | 32 | 4 | 16512 | 10.231 | 1601.47 | 0.928 | 137.98 | 11.158 | 1479.80 |
| 4096 | 32 | 8 | 33024 | 20.468 | 1600.94 | 1.606 | 159.38 | 22.074 | 1496.04 |
| 4096 | 32 | 16 | 66048 | 40.924 | 1601.42 | 2.639 | 193.99 | 43.563 | 1516.15 |
| 4096 | 32 | 32 | 132096 | 81.819 | 1601.98 | 4.466 | 229.29 | 86.284 | 1530.94 |
| 8192 | 32 | 1 | 8224 | 5.517 | 1484.74 | 0.409 | 78.16 | 5.927 | 1387.58 |
| 8192 | 32 | 2 | 16448 | 11.008 | 1488.43 | 0.622 | 102.92 | 11.629 | 1414.34 |
| 8192 | 32 | 4 | 32896 | 22.002 | 1489.29 | 0.987 | 129.66 | 22.990 | 1430.90 |
| 8192 | 32 | 8 | 65792 | 46.051 | 1423.11 | 1.858 | 137.79 | 47.909 | 1373.27 |
| 8192 | 32 | 16 | 131584 | 97.680 | 1341.85 | 2.872 | 178.28 | 100.552 | 1308.62 |
| 8192 | 32 | 32 | 263168 | 176.407 | 1486.02 | 5.048 | 202.85 | 181.455 | 1450.32 |
  • llama-bench
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|-------|------|--------|---------|--------:|---------:|---:|------|----:|
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1648.69 ± 1.80 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 85.60 ± 0.52 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1429.86 ± 1.01 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 82.03 ± 0.12 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1257.90 ± 1.81 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 78.23 ± 0.33 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 1013.49 ± 0.70 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 73.20 ± 0.28 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 721.11 ± 0.58 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 65.52 ± 0.10 |

build: b828e18c7 (7948)

ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.243 | 2109.23 | 0.419 | 76.34 | 0.662 | 821.84 |
| 512 | 32 | 2 | 1088 | 0.406 | 2521.40 | 0.575 | 111.36 | 0.981 | 1109.27 |
| 512 | 32 | 4 | 2176 | 0.744 | 2751.65 | 0.841 | 152.22 | 1.585 | 1372.71 |
| 512 | 32 | 8 | 4352 | 1.479 | 2770.20 | 1.330 | 192.48 | 2.809 | 1549.53 |
| 512 | 32 | 16 | 8704 | 2.951 | 2776.20 | 2.572 | 199.05 | 5.523 | 1575.93 |
| 512 | 32 | 32 | 17408 | 5.899 | 2777.64 | 2.603 | 393.34 | 8.502 | 2047.54 |
| 4096 | 32 | 1 | 4128 | 1.901 | 2154.15 | 0.474 | 67.58 | 2.375 | 1738.14 |
| 4096 | 32 | 2 | 8256 | 3.788 | 2162.89 | 0.652 | 98.17 | 4.439 | 1859.69 |
| 4096 | 32 | 4 | 16512 | 7.564 | 2166.18 | 0.990 | 129.24 | 8.554 | 1930.34 |
| 4096 | 32 | 8 | 33024 | 15.121 | 2166.98 | 1.632 | 156.82 | 16.754 | 1971.12 |
| 4096 | 32 | 16 | 66048 | 30.241 | 2167.09 | 3.166 | 161.72 | 33.407 | 1977.04 |
| 4096 | 32 | 32 | 132096 | 60.474 | 2167.42 | 3.780 | 270.93 | 64.254 | 2055.86 |
| 8192 | 32 | 1 | 8224 | 4.733 | 1730.92 | 0.483 | 66.29 | 5.215 | 1576.85 |
| 8192 | 32 | 2 | 16448 | 9.459 | 1732.09 | 0.722 | 88.58 | 10.182 | 1615.46 |
| 8192 | 32 | 4 | 32896 | 18.912 | 1732.65 | 1.120 | 114.26 | 20.032 | 1642.14 |
| 8192 | 32 | 8 | 65792 | 37.797 | 1733.91 | 1.873 | 136.67 | 39.670 | 1658.49 |
| 8192 | 32 | 16 | 131584 | 84.133 | 1557.92 | 3.718 | 137.72 | 87.850 | 1497.82 |
| 8192 | 32 | 32 | 263168 | 157.550 | 1663.88 | 4.854 | 210.98 | 162.403 | 1620.46 |
  • llama-bench
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|-------|------|--------|---------|--------:|---------:|---:|------|----:|
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2453.11 ± 1.70 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 78.97 ± 0.46 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1569.46 ± 1.97 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 71.18 ± 0.37 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1145.51 ± 1.16 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 65.11 ± 0.36 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 741.04 ± 0.74 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 56.87 ± 0.14 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 431.31 ± 0.31 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 45.26 ± 0.11 |

build: b828e18c7 (7948)

ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.339 | 1509.22 | 0.409 | 78.17 | 0.749 | 726.67 |
| 512 | 32 | 2 | 1088 | 0.646 | 1584.93 | 0.483 | 132.45 | 1.129 | 963.45 |
| 512 | 32 | 4 | 2176 | 1.258 | 1627.50 | 0.585 | 218.67 | 1.844 | 1180.21 |
| 512 | 32 | 8 | 4352 | 2.506 | 1634.41 | 1.005 | 254.83 | 3.511 | 1239.64 |
| 512 | 32 | 16 | 8704 | 5.007 | 1635.99 | 1.595 | 321.07 | 6.602 | 1318.38 |
| 512 | 32 | 32 | 17408 | 10.007 | 1637.19 | 1.676 | 611.12 | 11.683 | 1490.03 |
| 4096 | 32 | 1 | 4128 | 2.730 | 1500.46 | 0.431 | 74.31 | 3.160 | 1306.12 |
| 4096 | 32 | 2 | 8256 | 5.446 | 1504.33 | 0.524 | 122.04 | 5.970 | 1382.91 |
| 4096 | 32 | 4 | 16512 | 10.875 | 1506.59 | 0.662 | 193.45 | 11.537 | 1431.28 |
| 4096 | 32 | 8 | 33024 | 21.749 | 1506.61 | 1.158 | 221.11 | 22.907 | 1441.64 |
| 4096 | 32 | 16 | 66048 | 43.477 | 1507.36 | 1.901 | 269.32 | 45.378 | 1455.49 |
| 4096 | 32 | 32 | 132096 | 86.954 | 1507.37 | 2.325 | 440.42 | 89.279 | 1479.59 |
| 8192 | 32 | 1 | 8224 | 5.940 | 1379.21 | 0.449 | 71.20 | 6.389 | 1287.20 |
| 8192 | 32 | 2 | 16448 | 11.865 | 1380.84 | 0.559 | 114.59 | 12.424 | 1323.92 |
| 8192 | 32 | 4 | 32896 | 23.723 | 1381.25 | 0.728 | 175.80 | 24.452 | 1345.35 |
| 8192 | 32 | 8 | 65792 | 47.434 | 1381.63 | 1.279 | 200.09 | 48.713 | 1350.60 |
| 8192 | 32 | 16 | 131584 | 94.864 | 1381.69 | 2.198 | 232.97 | 97.061 | 1355.68 |
| 8192 | 32 | 32 | 263168 | 189.743 | 1381.57 | 3.052 | 335.50 | 192.795 | 1365.01 |
  • llama-bench
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|-------|------|--------|---------|--------:|---------:|---:|------|----:|
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1565.91 ± 0.86 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 79.68 ± 0.39 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1317.41 ± 1.02 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 74.70 ± 0.04 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1134.65 ± 0.76 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 71.31 ± 0.12 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 886.46 ± 0.78 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 65.93 ± 0.06 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 612.21 ± 0.30 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 56.83 ± 0.02 |

build: b828e18c7 (7948)

ggml-org/gemma-3-4b-it-qat-GGUF

Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.186 | 2748.06 | 0.235 | 136.28 | 0.421 | 1291.78 |
| 512 | 32 | 2 | 1088 | 0.342 | 2990.95 | 0.312 | 204.99 | 0.655 | 1662.15 |
| 512 | 32 | 4 | 2176 | 0.662 | 3092.69 | 0.404 | 316.97 | 1.066 | 2041.21 |
| 512 | 32 | 8 | 4352 | 1.317 | 3110.41 | 0.579 | 441.80 | 1.896 | 2294.97 |
| 512 | 32 | 16 | 8704 | 2.625 | 3120.23 | 1.207 | 424.08 | 3.833 | 2270.93 |
| 512 | 32 | 32 | 17408 | 5.242 | 3125.34 | 1.299 | 788.23 | 6.541 | 2661.19 |
| 4096 | 32 | 1 | 4128 | 1.408 | 2909.90 | 0.296 | 108.07 | 1.704 | 2422.95 |
| 4096 | 32 | 2 | 8256 | 2.793 | 2933.40 | 0.325 | 197.00 | 3.118 | 2648.25 |
| 4096 | 32 | 4 | 16512 | 5.567 | 2943.22 | 0.440 | 291.07 | 6.006 | 2749.05 |
| 4096 | 32 | 8 | 33024 | 11.114 | 2948.23 | 0.640 | 400.26 | 11.754 | 2809.59 |
| 4096 | 32 | 16 | 66048 | 22.217 | 2949.76 | 1.327 | 385.83 | 23.544 | 2805.26 |
| 4096 | 32 | 32 | 132096 | 44.420 | 2950.77 | 1.553 | 659.30 | 45.973 | 2873.36 |
| 8192 | 32 | 1 | 8224 | 2.860 | 2864.58 | 0.250 | 127.90 | 3.110 | 2644.42 |
| 8192 | 32 | 2 | 16448 | 5.702 | 2873.63 | 0.335 | 191.07 | 6.036 | 2724.77 |
| 8192 | 32 | 4 | 32896 | 11.383 | 2878.69 | 0.456 | 280.72 | 11.839 | 2778.63 |
| 8192 | 32 | 8 | 65792 | 22.750 | 2880.75 | 0.671 | 381.48 | 23.421 | 2809.14 |
| 8192 | 32 | 16 | 131584 | 45.484 | 2881.74 | 1.406 | 364.04 | 46.890 | 2806.22 |
| 8192 | 32 | 32 | 263168 | 90.956 | 2882.10 | 1.793 | 570.98 | 92.749 | 2837.41 |
  • llama-bench
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|-------|------|--------|---------|--------:|---------:|---:|------|----:|
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2923.59 ± 3.10 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 134.28 ± 1.29 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 2748.21 ± 3.05 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 133.11 ± 0.08 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 2641.45 ± 2.31 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 125.85 ± 0.35 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 2446.20 ± 2.94 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 125.00 ± 0.12 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 2129.18 ± 7.43 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 113.14 ± 0.10 |

build: b828e18c7 (7948)

ggml-org/GLM-4.7-Flash-GGUF

Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF

  • llama-batched-bench

main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|---:|---:|--:|-----:|-------:|---------:|-------:|---------:|----:|------:|
| 512 | 32 | 1 | 544 | 0.326 | 1568.69 | 0.522 | 61.28 | 0.849 | 641.09 |
| 512 | 32 | 2 | 1088 | 0.528 | 1939.42 | 0.744 | 86.07 | 1.272 | 855.63 |
| 512 | 32 | 4 | 2176 | 0.968 | 2114.85 | 1.105 | 115.85 | 2.073 | 1049.56 |
| 512 | 32 | 8 | 4352 | 1.928 | 2124.62 | 1.684 | 151.99 | 3.612 | 1204.82 |
| 512 | 32 | 16 | 8704 | 3.844 | 2131.34 | 3.141 | 162.99 | 6.985 | 1246.11 |
| 512 | 32 | 32 | 17408 | 7.683 | 2132.38 | 3.924 | 260.95 | 11.608 | 1499.71 |
| 4096 | 32 | 1 | 4128 | 3.280 | 1248.75 | 0.723 | 44.29 | 4.003 | 1031.33 |
| 4096 | 32 | 2 | 8256 | 6.545 | 1251.63 | 0.930 | 68.85 | 7.475 | 1104.53 |
| 4096 | 32 | 4 | 16512 | 13.080 | 1252.64 | 1.454 | 88.03 | 14.534 | 1136.12 |
| 4096 | 32 | 8 | 33024 | 26.154 | 1252.90 | 2.388 | 107.20 | 28.542 | 1157.04 |
| 4096 | 32 | 16 | 66048 | 52.297 | 1253.14 | 4.724 | 108.37 | 57.022 | 1158.30 |
| 4096 | 32 | 32 | 132096 | 104.578 | 1253.34 | 7.266 | 140.93 | 111.844 | 1181.08 |
| 8192 | 32 | 1 | 8224 | 9.623 | 851.31 | 0.767 | 41.72 | 10.390 | 791.54 |
| 8192 | 32 | 2 | 16448 | 20.916 | 783.32 | 1.148 | 55.74 | 22.064 | 745.45 |
| 8192 | 32 | 4 | 32896 | 43.509 | 753.14 | 1.833 | 69.82 | 45.342 | 725.51 |
| 8192 | 32 | 8 | 65792 | 79.621 | 823.10 | 3.180 | 80.50 | 82.801 | 794.58 |
| 8192 | 32 | 16 | 131584 | 153.770 | 852.39 | 6.502 | 78.74 | 160.272 | 821.00 |
| 8192 | 32 | 32 | 263168 | 307.539 | 852.39 | 10.839 | 94.48 | 318.378 | 826.59 |
  • llama-bench
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|-------|------|--------|---------|--------:|---------:|---:|------|----:|
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1629.33 ± 0.27 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 59.58 ± 0.13 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 732.67 ± 0.42 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 47.44 ± 0.15 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 474.33 ± 0.33 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 40.20 ± 0.20 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 277.46 ± 0.09 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 31.50 ± 0.93 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 151.44 ± 0.05 |
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 21.81 ± 0.01 |

build: b828e18c7 (7948)