# normal

LLAMA_CUBLAS=1 make -j && ./batched-bench /workspace/openllama-7b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 128 | 1 | 640 | 0.126 | 4063.46 | 1.973 | 64.88 | 2.099 | 304.91 |
| 512 | 128 | 2 | 768 | 0.130 | 3941.89 | 6.521 | 39.26 | 6.651 | 115.48 |
| 512 | 128 | 3 | 896 | 0.118 | 4327.61 | 6.988 | 54.95 | 7.106 | 126.09 |
| 512 | 128 | 4 | 1024 | 0.111 | 4597.99 | 6.209 | 82.47 | 6.320 | 162.03 |
| 512 | 128 | 5 | 1152 | 0.110 | 4664.21 | 7.266 | 88.09 | 7.375 | 156.20 |
| 512 | 128 | 6 | 1280 | 0.108 | 4745.09 | 7.300 | 105.20 | 7.408 | 172.78 |
| 512 | 128 | 7 | 1408 | 0.111 | 4632.48 | 7.369 | 121.60 | 7.479 | 188.26 |
| 512 | 128 | 8 | 1536 | 0.111 | 4603.20 | 7.200 | 142.22 | 7.312 | 210.08 |
| 512 | 128 | 16 | 2560 | 0.111 | 4602.00 | 7.168 | 285.71 | 7.279 | 351.68 |
| 512 | 128 | 32 | 4608 | 0.113 | 4521.81 | 7.693 | 532.46 | 7.806 | 590.32 |

# SKIP_KQ_KQV=1

LLAMA_CUBLAS=1 make -j && SKIP_KQ_KQV=1 ./batched-bench /workspace/openllama-7b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 128 | 1 | 640 | 0.091 | 5632.38 | 1.708 | 74.95 | 1.799 | 355.83 |
| 512 | 128 | 2 | 768 | 0.099 | 5159.00 | 1.722 | 148.66 | 1.821 | 421.67 |
| 512 | 128 | 3 | 896 | 0.081 | 6284.37 | 1.722 | 223.05 | 1.803 | 496.94 |
| 512 | 128 | 4 | 1024 | 0.075 | 6783.16 | 1.735 | 295.04 | 1.811 | 565.49 |
| 512 | 128 | 5 | 1152 | 0.073 | 7008.32 | 1.752 | 365.28 | 1.825 | 631.19 |
| 512 | 128 | 6 | 1280 | 0.082 | 6270.97 | 1.778 | 431.87 | 1.860 | 688.18 |
| 512 | 128 | 7 | 1408 | 0.074 | 6954.07 | 1.787 | 501.46 | 1.860 | 756.83 |
| 512 | 128 | 8 | 1536 | 0.084 | 6097.63 | 1.824 | 561.50 | 1.908 | 805.18 |
| 512 | 128 | 16 | 2560 | 0.072 | 7090.53 | 1.985 | 1031.65 | 2.057 | 1244.31 |
| 512 | 128 | 32 | 4608 | 0.084 | 6118.99 | 2.361 | 1734.66 | 2.445 | 1884.70 |

# SKIP_KQ_ALL=1

LLAMA_CUBLAS=1 make -j && SKIP_KQ_ALL=1 ./batched-bench /workspace/openllama-7b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 128 | 1 | 640 | 0.078 | 6574.39 | 1.622 | 78.92 | 1.700 | 376.54 |
| 512 | 128 | 2 | 768 | 0.069 | 7423.63 | 1.649 | 155.25 | 1.718 | 447.05 |
| 512 | 128 | 3 | 896 | 0.059 | 8675.03 | 1.645 | 233.38 | 1.704 | 525.70 |
| 512 | 128 | 4 | 1024 | 0.069 | 7436.13 | 1.723 | 297.17 | 1.792 | 571.50 |
| 512 | 128 | 5 | 1152 | 0.059 | 8691.08 | 1.683 | 380.30 | 1.742 | 661.38 |
| 512 | 128 | 6 | 1280 | 0.061 | 8415.79 | 1.666 | 460.93 | 1.727 | 741.15 |
| 512 | 128 | 7 | 1408 | 0.069 | 7384.22 | 1.674 | 535.26 | 1.743 | 807.66 |
| 512 | 128 | 8 | 1536 | 0.059 | 8710.30 | 1.689 | 606.37 | 1.748 | 878.96 |
| 512 | 128 | 16 | 2560 | 0.070 | 7330.83 | 1.747 | 1172.30 | 1.817 | 1409.04 |
| 512 | 128 | 32 | 4608 | 0.061 | 8420.91 | 1.920 | 2133.05 | 1.981 | 2326.03 |

# normal + force 1 KV head

LLAMA_CUBLAS=1 make -j && ./batched-bench /workspace/openllama-7b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 128 | 1 | 640 | 0.092 | 5589.03 | 1.878 | 68.15 | 1.970 | 324.90 |
| 512 | 128 | 2 | 768 | 0.087 | 5869.74 | 1.843 | 138.87 | 1.931 | 397.79 |
| 512 | 128 | 3 | 896 | 0.077 | 6656.61 | 1.858 | 206.70 | 1.935 | 463.13 |
| 512 | 128 | 4 | 1024 | 0.084 | 6120.96 | 1.845 | 277.43 | 1.929 | 530.81 |
| 512 | 128 | 5 | 1152 | 0.071 | 7204.98 | 1.899 | 337.08 | 1.970 | 584.86 |
| 512 | 128 | 6 | 1280 | 0.086 | 5943.47 | 1.908 | 402.46 | 1.994 | 641.79 |
| 512 | 128 | 7 | 1408 | 0.073 | 7014.95 | 1.936 | 462.79 | 2.009 | 700.82 |
| 512 | 128 | 8 | 1536 | 0.084 | 6120.23 | 1.938 | 528.31 | 2.022 | 759.67 |
| 512 | 128 | 16 | 2560 | 0.072 | 7110.02 | 2.109 | 971.01 | 2.181 | 1173.69 |
| 512 | 128 | 32 | 4608 | 0.090 | 5705.43 | 2.528 | 1620.15 | 2.618 | 1760.19 |

# normal

LLAMA_CUBLAS=1 make -j && ./batched-bench /workspace/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 128 | 1 | 640 | 0.076 | 6716.34 | 0.662 | 193.33 | 0.738 | 866.82 |
| 512 | 128 | 2 | 768 | 0.060 | 8520.84 | 3.218 | 79.56 | 3.278 | 234.29 |
| 512 | 128 | 3 | 896 | 0.064 | 7995.38 | 3.258 | 117.86 | 3.322 | 269.70 |
| 512 | 128 | 4 | 1024 | 0.054 | 9441.96 | 3.302 | 155.05 | 3.356 | 305.08 |
| 512 | 128 | 5 | 1152 | 0.064 | 7943.90 | 3.259 | 196.39 | 3.323 | 346.65 |
| 512 | 128 | 6 | 1280 | 0.053 | 9593.22 | 3.325 | 230.98 | 3.378 | 378.88 |
| 512 | 128 | 7 | 1408 | 0.066 | 7770.41 | 3.425 | 261.62 | 3.491 | 403.36 |
| 512 | 128 | 8 | 1536 | 0.054 | 9464.83 | 3.463 | 295.66 | 3.518 | 436.67 |
| 512 | 128 | 16 | 2560 | 0.050 | 10192.71 | 3.369 | 607.92 | 3.419 | 748.74 |
| 512 | 128 | 32 | 4608 | 0.056 | 9119.08 | 3.770 | 1086.37 | 3.826 | 1204.24 |

# SKIP_KQ_KQV=1

LLAMA_CUBLAS=1 make -j && SKIP_KQ_KQV=1 ./batched-bench /workspace/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 128 | 1 | 640 | 0.061 | 8401.98 | 0.634 | 201.87 | 0.695 | 920.87 |
| 512 | 128 | 2 | 768 | 0.121 | 4220.84 | 0.704 | 363.55 | 0.825 | 930.39 |
| 512 | 128 | 3 | 896 | 0.028 | 18408.00 | 0.701 | 547.96 | 0.729 | 1229.77 |
| 512 | 128 | 4 | 1024 | 0.027 | 19312.01 | 0.706 | 725.68 | 0.732 | 1398.81 |
| 512 | 128 | 5 | 1152 | 0.030 | 16944.11 | 0.795 | 805.51 | 0.825 | 1396.80 |
| 512 | 128 | 6 | 1280 | 0.028 | 18477.75 | 0.874 | 878.74 | 0.902 | 1419.56 |
| 512 | 128 | 7 | 1408 | 0.028 | 18210.92 | 0.866 | 1034.25 | 0.894 | 1574.17 |
| 512 | 128 | 8 | 1536 | 0.039 | 13086.60 | 0.907 | 1129.62 | 0.946 | 1624.32 |
| 512 | 128 | 16 | 2560 | 0.029 | 17533.65 | 0.937 | 2186.38 | 0.966 | 2650.35 |
| 512 | 128 | 32 | 4608 | 0.026 | 19496.59 | 1.270 | 3224.28 | 1.297 | 3553.84 |

# SKIP_KQ_ALL=1

LLAMA_CUBLAS=1 make -j && SKIP_KQ_ALL=1 ./batched-bench /workspace/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 128 | 1 | 640 | 0.100 | 5144.38 | 0.700 | 182.90 | 0.799 | 800.63 |
| 512 | 128 | 2 | 768 | 0.025 | 20631.85 | 0.792 | 323.26 | 0.817 | 940.31 |
| 512 | 128 | 3 | 896 | 0.023 | 21933.77 | 0.682 | 562.83 | 0.706 | 1269.82 |
| 512 | 128 | 4 | 1024 | 0.024 | 21400.21 | 0.633 | 809.26 | 0.657 | 1559.55 |
| 512 | 128 | 5 | 1152 | 0.021 | 24842.31 | 0.697 | 918.88 | 0.717 | 1606.45 |
| 512 | 128 | 6 | 1280 | 0.055 | 9246.38 | 0.805 | 953.94 | 0.860 | 1487.58 |
| 512 | 128 | 7 | 1408 | 0.021 | 24369.35 | 0.814 | 1100.28 | 0.835 | 1685.53 |
| 512 | 128 | 8 | 1536 | 0.026 | 19699.13 | 0.822 | 1245.98 | 0.848 | 1811.67 |
| 512 | 128 | 16 | 2560 | 0.024 | 21178.03 | 0.919 | 2229.18 | 0.943 | 2715.03 |
| 512 | 128 | 32 | 4608 | 0.029 | 17669.79 | 0.990 | 4136.84 | 1.019 | 4521.62 |

# normal + force 1 KV head

LLAMA_CUBLAS=1 make -j && ./batched-bench /workspace/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32

| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 128 | 1 | 640 | 0.046 | 11088.49 | 0.649 | 197.28 | 0.695 | 920.88 |
| 512 | 128 | 2 | 768 | 0.031 | 16488.47 | 0.803 | 318.99 | 0.834 | 921.32 |
| 512 | 128 | 3 | 896 | 0.024 | 21380.55 | 0.757 | 507.03 | 0.781 | 1146.81 |
| 512 | 128 | 4 | 1024 | 0.034 | 15196.49 | 0.783 | 653.82 | 0.817 | 1253.70 |
| 512 | 128 | 5 | 1152 | 0.024 | 21340.45 | 0.772 | 829.33 | 0.796 | 1447.79 |
| 512 | 128 | 6 | 1280 | 0.026 | 19529.31 | 0.980 | 783.49 | 1.006 | 1271.81 |
| 512 | 128 | 7 | 1408 | 0.027 | 18856.11 | 0.849 | 1055.80 | 0.876 | 1607.68 |
| 512 | 128 | 8 | 1536 | 0.035 | 14816.53 | 0.812 | 1261.46 | 0.846 | 1814.93 |
| 512 | 128 | 16 | 2560 | 0.026 | 19875.00 | 1.247 | 1641.93 | 1.273 | 2010.89 |
| 512 | 128 | 32 | 4608 | 0.032 | 15810.77 | 1.402 | 2921.63 | 1.434 | 3212.63 |