[Kernel] Replaced blockReduce[...] functions with cub::BlockReduce (#7233)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-21 20:18:00 -04:00
parent 9984605412
commit 7937009a7e
8 changed files with 237 additions and 116 deletions
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -83,7 +83,7 @@ def test_models(
                                                      for m in E4M3_KV_MODELS])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
+@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.