Compare commits
41 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
61c7a1b856 | ||
|
|
374ee287d8 | ||
|
|
a4d83661d7 | ||
|
|
8363cd093d | ||
|
|
6c5a3195db | ||
|
|
073d1ed354 | ||
|
|
3d446433ec | ||
|
|
1fe0fd12d3 | ||
|
|
dafb4e504a | ||
|
|
68cf1601d3 | ||
|
|
61f412187d | ||
|
|
05ccd0aa35 | ||
|
|
f690372b68 | ||
|
|
8b3e94a357 | ||
|
|
437f9162d0 | ||
|
|
4f065f12f5 | ||
|
|
228b768db6 | ||
|
|
027827cc1d | ||
|
|
72a8639b68 | ||
|
|
99abb8b650 | ||
|
|
3a1e648158 | ||
|
|
46c759c165 | ||
|
|
179a619c21 | ||
|
|
452e8fd968 | ||
|
|
8b793f7ec6 | ||
|
|
af35d3a3cc | ||
|
|
3b457143d2 | ||
|
|
ab656f2c2f | ||
|
|
64fc2193dc | ||
|
|
dd732028f5 | ||
|
|
414919138b | ||
|
|
db7c8ca910 | ||
|
|
f863ffc965 | ||
|
|
400d483e87 | ||
|
|
d1695758b2 | ||
|
|
53a0cf8b95 | ||
|
|
5eeabc2a44 | ||
|
|
18551e820c | ||
|
|
e41e160263 | ||
|
|
b89fb2a4a1 | ||
|
|
5340b0e221 |
@@ -1,25 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# Build the docker image.
|
|
||||||
docker build -f Dockerfile.tpu -t vllm-tpu .
|
|
||||||
|
|
||||||
# Set up cleanup.
|
|
||||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
# Remove the container that might not be cleaned up in the previous run.
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# For HF_TOKEN.
|
|
||||||
source /etc/environment
|
|
||||||
# Run a simple end-to-end example.
|
|
||||||
docker run --privileged --net host --shm-size=16G -it \
|
|
||||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
|
||||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
|
||||||
&& python3 -m pip install pytest \
|
|
||||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
|
||||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
|
||||||
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
|
||||||
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
|
||||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
|
||||||
@@ -15,13 +15,22 @@ remove_docker_container
|
|||||||
source /etc/environment
|
source /etc/environment
|
||||||
# Run a simple end-to-end example.
|
# Run a simple end-to-end example.
|
||||||
docker run --privileged --net host --shm-size=16G -it \
|
docker run --privileged --net host --shm-size=16G -it \
|
||||||
-e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
|
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install pytest \
|
&& python3 -m pip install pytest \
|
||||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
&& echo TEST_1 \
|
||||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
&& VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||||
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
&& echo TEST_2 \
|
||||||
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||||
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
&& echo TEST_3 \
|
||||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||||
|
&& echo TEST_4 \
|
||||||
|
&& VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||||
|
&& echo TEST_5 \
|
||||||
|
&& VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py" \
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||||
|
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||||
|
|
||||||
|
|||||||
@@ -12,10 +12,11 @@ docker build -t ${image_name} -f Dockerfile.xpu .
|
|||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
|
docker rm -f "${container_name}" || true;
|
||||||
|
docker image rm -f "${image_name}" || true;
|
||||||
|
docker system prune -f || true;
|
||||||
}
|
}
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Run the image and test offline inference/tensor parallel
|
# Run the image and test offline inference/tensor parallel
|
||||||
docker run \
|
docker run \
|
||||||
@@ -25,6 +26,6 @@ docker run \
|
|||||||
--name "${container_name}" \
|
--name "${container_name}" \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
sh -c '
|
sh -c '
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
|
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
|
||||||
'
|
'
|
||||||
|
|||||||
@@ -1,11 +1,7 @@
|
|||||||
FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
|
# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
|
||||||
|
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
|
||||||
|
|
||||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
RUN rm /etc/apt/sources.list.d/intel-graphics.list
|
||||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
|
||||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
|
||||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
|
||||||
|
|
||||||
RUN apt-get update -y && \
|
RUN apt-get update -y && \
|
||||||
apt-get install -y --no-install-recommends --fix-missing \
|
apt-get install -y --no-install-recommends --fix-missing \
|
||||||
@@ -21,8 +17,6 @@ RUN apt-get update -y && \
|
|||||||
python3 \
|
python3 \
|
||||||
python3-dev \
|
python3-dev \
|
||||||
python3-pip \
|
python3-pip \
|
||||||
libze-intel-gpu-dev \
|
|
||||||
libze-intel-gpu1 \
|
|
||||||
wget
|
wget
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
WORKDIR /workspace/vllm
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
|
||||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
|
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||||
|
|||||||
@@ -732,8 +732,11 @@ def main(args: argparse.Namespace):
|
|||||||
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
|
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
|
||||||
base_url = f"http://{args.host}:{args.port}"
|
base_url = f"http://{args.host}:{args.port}"
|
||||||
|
|
||||||
tokenizer = get_tokenizer(tokenizer_id,
|
tokenizer = get_tokenizer(
|
||||||
trust_remote_code=args.trust_remote_code)
|
tokenizer_id,
|
||||||
|
trust_remote_code=args.trust_remote_code,
|
||||||
|
tokenizer_mode=args.tokenizer_mode,
|
||||||
|
)
|
||||||
|
|
||||||
if args.dataset == 'grammar':
|
if args.dataset == 'grammar':
|
||||||
args.structure_type = 'guided_grammar'
|
args.structure_type = 'guided_grammar'
|
||||||
@@ -876,6 +879,13 @@ if __name__ == "__main__":
|
|||||||
help=
|
help=
|
||||||
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--tokenizer-mode",
|
||||||
|
type=str,
|
||||||
|
default="auto",
|
||||||
|
help=
|
||||||
|
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-prompts",
|
"--num-prompts",
|
||||||
type=int,
|
type=int,
|
||||||
|
|||||||
@@ -54,6 +54,7 @@ for qps in "${QPS_VALUES[@]}"; do
|
|||||||
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
|
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
|
||||||
--request-rate $qps \
|
--request-rate $qps \
|
||||||
--result-filename "$FILENAME" \
|
--result-filename "$FILENAME" \
|
||||||
|
--tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
|
||||||
--port ${PORT:-8000}
|
--port ${PORT:-8000}
|
||||||
|
|
||||||
echo "Completed benchmark with QPS: $qps"
|
echo "Completed benchmark with QPS: $qps"
|
||||||
|
|||||||
@@ -350,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(
|
|||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
// KV_T is the stored data type of kv-cache.
|
// KV_T is the data type of key and value tensors.
|
||||||
// CACHE_T is the data type of key and value tensors.
|
// CACHE_T is the stored data type of kv-cache.
|
||||||
// KV_DTYPE is the real data type of kv-cache.
|
// KV_DTYPE is the real data type of kv-cache.
|
||||||
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
|
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
|
||||||
vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||||
@@ -393,8 +393,8 @@ void reshape_and_cache(
|
|||||||
CALL_RESHAPE_AND_CACHE)
|
CALL_RESHAPE_AND_CACHE)
|
||||||
}
|
}
|
||||||
|
|
||||||
// KV_T is the stored data type of kv-cache.
|
// KV_T is the data type of key and value tensors.
|
||||||
// CACHE_T is the data type of key and value tensors.
|
// CACHE_T is the stored data type of kv-cache.
|
||||||
// KV_DTYPE is the real data type of kv-cache.
|
// KV_DTYPE is the real data type of kv-cache.
|
||||||
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
|
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
|
||||||
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||||
@@ -446,8 +446,8 @@ void reshape_and_cache_flash(
|
|||||||
CALL_RESHAPE_AND_CACHE_FLASH);
|
CALL_RESHAPE_AND_CACHE_FLASH);
|
||||||
}
|
}
|
||||||
|
|
||||||
// KV_T is the stored data type of kv-cache.
|
// KV_T is the data type of key and value tensors.
|
||||||
// CACHE_T is the data type of key and value tensors.
|
// CACHE_T is the stored data type of kv-cache.
|
||||||
// KV_DTYPE is the real data type of kv-cache.
|
// KV_DTYPE is the real data type of kv-cache.
|
||||||
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
|
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
|
||||||
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||||
|
|||||||
@@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
|
|||||||
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
||||||
|
|
||||||
```Dockerfile
|
```Dockerfile
|
||||||
FROM vllm/vllm-openai:v0.7.3
|
FROM vllm/vllm-openai:v0.8.0
|
||||||
|
|
||||||
# e.g. install the `audio` and `video` optional dependencies
|
# e.g. install the `audio` and `video` optional dependencies
|
||||||
# NOTE: Make sure the version of vLLM matches the base image!
|
# NOTE: Make sure the version of vLLM matches the base image!
|
||||||
RUN uv pip install --system vllm[audio,video]==0.7.3
|
RUN uv pip install vllm[audio,video]==0.8.0
|
||||||
```
|
```
|
||||||
|
|
||||||
:::
|
:::
|
||||||
@@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
|
|||||||
```Dockerfile
|
```Dockerfile
|
||||||
FROM vllm/vllm-openai:latest
|
FROM vllm/vllm-openai:latest
|
||||||
|
|
||||||
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
|
RUN uv pip install git+https://github.com/huggingface/transformers.git
|
||||||
```
|
```
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|||||||
@@ -191,7 +191,7 @@ When the head block (least recently used block) of the free queue is cached, we
|
|||||||
|
|
||||||
In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
|
In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
|
||||||
|
|
||||||
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
|
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.
|
||||||
|
|
||||||
:::{image} /assets/design/v1/prefix_caching/example-time-1.png
|
:::{image} /assets/design/v1/prefix_caching/example-time-1.png
|
||||||
:alt: Example Time 1
|
:alt: Example Time 1
|
||||||
@@ -203,7 +203,7 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
|
|||||||
:alt: Example Time 3
|
:alt: Example Time 3
|
||||||
:::
|
:::
|
||||||
|
|
||||||
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
|
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
|
||||||
|
|
||||||
:::{image} /assets/design/v1/prefix_caching/example-time-4.png
|
:::{image} /assets/design/v1/prefix_caching/example-time-4.png
|
||||||
:alt: Example Time 4
|
:alt: Example Time 4
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
|
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
|
||||||
|
|
||||||
|
To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
|
||||||
|
|
||||||
## Why vLLM V1?
|
## Why vLLM V1?
|
||||||
|
|
||||||
vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
|
vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
|
||||||
|
|||||||
@@ -768,7 +768,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
|||||||
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
|
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
|
||||||
* ✅︎
|
* ✅︎
|
||||||
* ✅︎
|
* ✅︎
|
||||||
*
|
* ⚠️
|
||||||
- * `GLM4VForCausalLM`<sup>^</sup>
|
- * `GLM4VForCausalLM`<sup>^</sup>
|
||||||
* GLM-4V
|
* GLM-4V
|
||||||
* T + I
|
* T + I
|
||||||
@@ -884,7 +884,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
|||||||
- * `PixtralForConditionalGeneration`
|
- * `PixtralForConditionalGeneration`
|
||||||
* Pixtral
|
* Pixtral
|
||||||
* T + I<sup>+</sup>
|
* T + I<sup>+</sup>
|
||||||
* `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b`, etc.
|
* `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.
|
||||||
*
|
*
|
||||||
* ✅︎
|
* ✅︎
|
||||||
* ✅︎
|
* ✅︎
|
||||||
@@ -951,13 +951,10 @@ V0 correctly implements the model's attention pattern:
|
|||||||
|
|
||||||
V1 currently uses a simplified attention pattern:
|
V1 currently uses a simplified attention pattern:
|
||||||
- Uses causal attention for all tokens, including image tokens
|
- Uses causal attention for all tokens, including image tokens
|
||||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
|
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
|
||||||
- Will be updated in the future to support the correct behavior
|
- Will be updated in the future to support the correct behavior
|
||||||
- Does not support `"do_pan_and_scan": True`
|
|
||||||
|
|
||||||
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
||||||
|
|
||||||
For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
:::{note}
|
:::{note}
|
||||||
|
|||||||
@@ -6,14 +6,14 @@ import argparse
|
|||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
|
|
||||||
# This script is an offline demo for running Pixtral.
|
# This script is an offline demo for running Mistral-Small-3
|
||||||
#
|
#
|
||||||
# If you want to run a server/client setup, please follow this code:
|
# If you want to run a server/client setup, please follow this code:
|
||||||
#
|
#
|
||||||
# - Server:
|
# - Server:
|
||||||
#
|
#
|
||||||
# ```bash
|
# ```bash
|
||||||
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
|
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
|
||||||
# ```
|
# ```
|
||||||
#
|
#
|
||||||
# - Client:
|
# - Client:
|
||||||
@@ -23,7 +23,7 @@ from vllm.sampling_params import SamplingParams
|
|||||||
# --header 'Content-Type: application/json' \
|
# --header 'Content-Type: application/json' \
|
||||||
# --header 'Authorization: Bearer token' \
|
# --header 'Authorization: Bearer token' \
|
||||||
# --data '{
|
# --data '{
|
||||||
# "model": "mistralai/Pixtral-12B-2409",
|
# "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
|
||||||
# "messages": [
|
# "messages": [
|
||||||
# {
|
# {
|
||||||
# "role": "user",
|
# "role": "user",
|
||||||
@@ -44,7 +44,7 @@ from vllm.sampling_params import SamplingParams
|
|||||||
|
|
||||||
|
|
||||||
def run_simple_demo(args: argparse.Namespace):
|
def run_simple_demo(args: argparse.Namespace):
|
||||||
model_name = "mistralai/Pixtral-12B-2409"
|
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||||
sampling_params = SamplingParams(max_tokens=8192)
|
sampling_params = SamplingParams(max_tokens=8192)
|
||||||
|
|
||||||
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
|
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
|
||||||
@@ -83,7 +83,7 @@ def run_simple_demo(args: argparse.Namespace):
|
|||||||
|
|
||||||
|
|
||||||
def run_advanced_demo(args: argparse.Namespace):
|
def run_advanced_demo(args: argparse.Namespace):
|
||||||
model_name = "mistralai/Pixtral-12B-2409"
|
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||||
max_img_per_msg = 5
|
max_img_per_msg = 5
|
||||||
max_tokens_per_img = 4096
|
max_tokens_per_img = 4096
|
||||||
|
|
||||||
|
|||||||
@@ -60,7 +60,7 @@ class TestSetting:
|
|||||||
# embedding model
|
# embedding model
|
||||||
TestSetting(
|
TestSetting(
|
||||||
model="BAAI/bge-multilingual-gemma2",
|
model="BAAI/bge-multilingual-gemma2",
|
||||||
model_args=["--task", "embed"],
|
model_args=["--task", "embed", "--dtype", "bfloat16"],
|
||||||
pp_size=1,
|
pp_size=1,
|
||||||
tp_size=1,
|
tp_size=1,
|
||||||
attn_backend="FLASH_ATTN",
|
attn_backend="FLASH_ATTN",
|
||||||
|
|||||||
@@ -14,8 +14,8 @@ import torch.nn as nn
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
|
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
|
||||||
BatchFeature)
|
BatchEncoding, BatchFeature)
|
||||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||||
|
|
||||||
from tests.models.utils import (TokensTextLogprobs,
|
from tests.models.utils import (TokensTextLogprobs,
|
||||||
@@ -23,7 +23,7 @@ from tests.models.utils import (TokensTextLogprobs,
|
|||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.assets.video import VideoAsset
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.config import TaskOption, TokenizerPoolConfig
|
from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
|
||||||
from vllm.connections import global_http_connection
|
from vllm.connections import global_http_connection
|
||||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||||
init_distributed_environment,
|
init_distributed_environment,
|
||||||
@@ -34,8 +34,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.outputs import RequestOutput
|
from vllm.outputs import RequestOutput
|
||||||
from vllm.sampling_params import BeamSearchParams
|
from vllm.sampling_params import BeamSearchParams
|
||||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
|
from vllm.utils import cuda_device_count_stateless, is_list_of
|
||||||
identity, is_list_of)
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@@ -271,14 +270,18 @@ _R = TypeVar("_R")
|
|||||||
|
|
||||||
class HfRunner:
|
class HfRunner:
|
||||||
|
|
||||||
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
def get_default_device(self):
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
return ("cpu" if current_platform.is_cpu()
|
||||||
|
or current_platform.is_openvino() else "cuda")
|
||||||
|
|
||||||
|
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
||||||
if x is None or isinstance(x, (bool, )):
|
if x is None or isinstance(x, (bool, )):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
if device is None:
|
if device is None:
|
||||||
device = "cpu" if current_platform.is_cpu(
|
device = self.device
|
||||||
) or current_platform.is_openvino() else "cuda"
|
|
||||||
|
|
||||||
if isinstance(x, dict):
|
if isinstance(x, dict):
|
||||||
return {k: self.wrap_device(v, device) for k, v in x.items()}
|
return {k: self.wrap_device(v, device) for k, v in x.items()}
|
||||||
@@ -291,45 +294,59 @@ class HfRunner:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
dtype: str = "half",
|
dtype: str = "auto",
|
||||||
*,
|
*,
|
||||||
model_kwargs: Optional[dict[str, Any]] = None,
|
model_kwargs: Optional[dict[str, Any]] = None,
|
||||||
is_sentence_transformer: bool = False,
|
is_sentence_transformer: bool = False,
|
||||||
is_cross_encoder: bool = False,
|
is_cross_encoder: bool = False,
|
||||||
skip_tokenizer_init: bool = False,
|
skip_tokenizer_init: bool = False,
|
||||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||||
postprocess_inputs: Callable[..., BatchEncoding] = identity,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
|
||||||
|
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
|
|
||||||
|
self.config = AutoConfig.from_pretrained(
|
||||||
|
model_name,
|
||||||
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
|
self.device = self.get_default_device()
|
||||||
|
self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
|
||||||
|
|
||||||
|
model_kwargs = model_kwargs if model_kwargs is not None else {}
|
||||||
|
model_kwargs.setdefault("torch_dtype", torch_dtype)
|
||||||
|
|
||||||
if is_sentence_transformer:
|
if is_sentence_transformer:
|
||||||
# Lazy init required for AMD CI
|
# Lazy init required for AMD CI
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
self.model = self.wrap_device(
|
|
||||||
SentenceTransformer(
|
self.model = SentenceTransformer(
|
||||||
model_name,
|
model_name,
|
||||||
device="cpu",
|
device=self.device,
|
||||||
trust_remote_code=True,
|
model_kwargs=model_kwargs,
|
||||||
).to(dtype=torch_dtype))
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
elif is_cross_encoder:
|
elif is_cross_encoder:
|
||||||
# Lazy init required for AMD CI
|
# Lazy init required for AMD CI
|
||||||
from sentence_transformers import CrossEncoder
|
from sentence_transformers import CrossEncoder
|
||||||
self.model = CrossEncoder(model_name,
|
|
||||||
device="cpu",
|
self.model = CrossEncoder(
|
||||||
trust_remote_code=True)
|
model_name,
|
||||||
self.model.model = self.wrap_device(self.model.model)\
|
device=self.device,
|
||||||
.to(dtype=torch_dtype)
|
automodel_args=model_kwargs,
|
||||||
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
model_kwargs = model_kwargs if model_kwargs is not None else {}
|
model = auto_cls.from_pretrained(
|
||||||
self.model = self.wrap_device(
|
model_name,
|
||||||
auto_cls.from_pretrained(
|
trust_remote_code=True,
|
||||||
model_name,
|
**model_kwargs,
|
||||||
torch_dtype=torch_dtype,
|
)
|
||||||
trust_remote_code=True,
|
|
||||||
**model_kwargs,
|
if (getattr(model, "quantization_method", None) != "bitsandbytes"
|
||||||
))
|
and len({p.device
|
||||||
|
for p in model.parameters()}) < 2):
|
||||||
|
model = model.to(self.device)
|
||||||
|
|
||||||
|
self.model = model
|
||||||
|
|
||||||
if not skip_tokenizer_init:
|
if not skip_tokenizer_init:
|
||||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||||
@@ -349,16 +366,13 @@ class HfRunner:
|
|||||||
if skip_tokenizer_init:
|
if skip_tokenizer_init:
|
||||||
self.tokenizer = self.processor.tokenizer
|
self.tokenizer = self.processor.tokenizer
|
||||||
|
|
||||||
self.dtype = dtype
|
|
||||||
self.postprocess_inputs = postprocess_inputs
|
|
||||||
|
|
||||||
def get_inputs(
|
def get_inputs(
|
||||||
self,
|
self,
|
||||||
prompts: list[str],
|
prompts: list[str],
|
||||||
images: Optional[PromptImageInput] = None,
|
images: Optional[PromptImageInput] = None,
|
||||||
videos: Optional[PromptVideoInput] = None,
|
videos: Optional[PromptVideoInput] = None,
|
||||||
audios: Optional[PromptAudioInput] = None,
|
audios: Optional[PromptAudioInput] = None,
|
||||||
) -> list[BatchEncoding]:
|
) -> list[Union[BatchFeature, BatchEncoding]]:
|
||||||
if images is not None:
|
if images is not None:
|
||||||
assert len(prompts) == len(images)
|
assert len(prompts) == len(images)
|
||||||
|
|
||||||
@@ -368,7 +382,7 @@ class HfRunner:
|
|||||||
if audios is not None:
|
if audios is not None:
|
||||||
assert len(prompts) == len(audios)
|
assert len(prompts) == len(audios)
|
||||||
|
|
||||||
all_inputs: list[BatchEncoding] = []
|
all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
|
||||||
for i, prompt in enumerate(prompts):
|
for i, prompt in enumerate(prompts):
|
||||||
processor_kwargs: dict[str, Any] = {
|
processor_kwargs: dict[str, Any] = {
|
||||||
"text": prompt,
|
"text": prompt,
|
||||||
@@ -384,7 +398,8 @@ class HfRunner:
|
|||||||
processor_kwargs["sampling_rate"] = sr
|
processor_kwargs["sampling_rate"] = sr
|
||||||
|
|
||||||
inputs = self.processor(**processor_kwargs)
|
inputs = self.processor(**processor_kwargs)
|
||||||
inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
|
if isinstance(inputs, BatchFeature):
|
||||||
|
inputs = inputs.to(dtype=self.dtype)
|
||||||
|
|
||||||
all_inputs.append(inputs)
|
all_inputs.append(inputs)
|
||||||
|
|
||||||
@@ -417,7 +432,7 @@ class HfRunner:
|
|||||||
outputs: list[tuple[list[list[int]], list[str]]] = []
|
outputs: list[tuple[list[list[int]], list[str]]] = []
|
||||||
for inputs in all_inputs:
|
for inputs in all_inputs:
|
||||||
output_ids = self.model.generate(
|
output_ids = self.model.generate(
|
||||||
**self.wrap_device(inputs, device=self.model.device.type),
|
**self.wrap_device(inputs),
|
||||||
use_cache=True,
|
use_cache=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
@@ -488,7 +503,7 @@ class HfRunner:
|
|||||||
all_logprobs: list[list[torch.Tensor]] = []
|
all_logprobs: list[list[torch.Tensor]] = []
|
||||||
for inputs in all_inputs:
|
for inputs in all_inputs:
|
||||||
output = self.model.generate(
|
output = self.model.generate(
|
||||||
**self.wrap_device(inputs, device=self.model.device.type),
|
**self.wrap_device(inputs),
|
||||||
use_cache=True,
|
use_cache=True,
|
||||||
do_sample=False,
|
do_sample=False,
|
||||||
max_new_tokens=max_tokens,
|
max_new_tokens=max_tokens,
|
||||||
@@ -569,7 +584,7 @@ class HfRunner:
|
|||||||
|
|
||||||
for inputs in all_inputs:
|
for inputs in all_inputs:
|
||||||
output = self.model.generate(
|
output = self.model.generate(
|
||||||
**self.wrap_device(inputs, device=self.model.device.type),
|
**self.wrap_device(inputs),
|
||||||
use_cache=True,
|
use_cache=True,
|
||||||
do_sample=False,
|
do_sample=False,
|
||||||
max_new_tokens=max_tokens,
|
max_new_tokens=max_tokens,
|
||||||
@@ -620,19 +635,15 @@ class HfRunner:
|
|||||||
if images is not None and images[i] is not None:
|
if images is not None and images[i] is not None:
|
||||||
processor_kwargs["images"] = images[i]
|
processor_kwargs["images"] = images[i]
|
||||||
|
|
||||||
encoder_inputs = self.wrap_device(
|
encoder_inputs = self.processor(**processor_kwargs)
|
||||||
self.processor(**processor_kwargs),
|
encoder_inputs = self.wrap_device(encoder_inputs)
|
||||||
device=self.model.device.type,
|
|
||||||
)
|
|
||||||
|
|
||||||
if decoder_prompt is None:
|
if decoder_prompt is None:
|
||||||
decoder_input_ids = None
|
decoder_input_ids = None
|
||||||
else:
|
else:
|
||||||
decoder_input_ids = self.wrap_device(
|
decoder_inputs = self.tokenizer(decoder_prompt,
|
||||||
self.tokenizer(decoder_prompt,
|
return_tensors="pt")
|
||||||
return_tensors="pt").input_ids,
|
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
|
||||||
device=self.model.device.type,
|
|
||||||
)
|
|
||||||
|
|
||||||
output = self.model.generate(
|
output = self.model.generate(
|
||||||
decoder_input_ids=decoder_input_ids,
|
decoder_input_ids=decoder_input_ids,
|
||||||
@@ -684,6 +695,7 @@ class VllmRunner:
|
|||||||
"""
|
"""
|
||||||
The default value of some arguments have been modified from
|
The default value of some arguments have been modified from
|
||||||
:class:`~vllm.LLM` as follows:
|
:class:`~vllm.LLM` as follows:
|
||||||
|
|
||||||
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
|
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
|
||||||
- `seed`: Set to `0` instead of `None` for test reproducibility.
|
- `seed`: Set to `0` instead of `None` for test reproducibility.
|
||||||
- `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
|
- `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
|
||||||
@@ -701,10 +713,8 @@ class VllmRunner:
|
|||||||
tokenizer_mode: str = "auto",
|
tokenizer_mode: str = "auto",
|
||||||
trust_remote_code: bool = True,
|
trust_remote_code: bool = True,
|
||||||
seed: Optional[int] = 0,
|
seed: Optional[int] = 0,
|
||||||
# Use smaller max model length, otherwise bigger model cannot run due
|
|
||||||
# to kv cache size limit.
|
|
||||||
max_model_len: int = 1024,
|
max_model_len: int = 1024,
|
||||||
dtype: str = "half",
|
dtype: str = "auto",
|
||||||
disable_log_stats: bool = True,
|
disable_log_stats: bool = True,
|
||||||
tensor_parallel_size: int = 1,
|
tensor_parallel_size: int = 1,
|
||||||
block_size: int = 16,
|
block_size: int = 16,
|
||||||
|
|||||||
@@ -64,7 +64,6 @@ def test_multi_chat():
|
|||||||
def test_chat_multi_image(image_urls: list[str]):
|
def test_chat_multi_image(image_urls: list[str]):
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
model="microsoft/Phi-3.5-vision-instruct",
|
model="microsoft/Phi-3.5-vision-instruct",
|
||||||
dtype="bfloat16",
|
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=5,
|
max_num_seqs=5,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
|
|||||||
@@ -18,8 +18,6 @@ TEST_AUDIO_URLS = [
|
|||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server():
|
def server():
|
||||||
args = [
|
args = [
|
||||||
"--dtype",
|
|
||||||
"bfloat16",
|
|
||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"2048",
|
"2048",
|
||||||
"--max-num-seqs",
|
"--max-num-seqs",
|
||||||
|
|||||||
@@ -24,8 +24,6 @@ def server():
|
|||||||
args = [
|
args = [
|
||||||
"--task",
|
"--task",
|
||||||
"generate",
|
"generate",
|
||||||
"--dtype",
|
|
||||||
"bfloat16",
|
|
||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"32768",
|
"32768",
|
||||||
"--max-num-seqs",
|
"--max-num-seqs",
|
||||||
|
|||||||
@@ -25,8 +25,6 @@ def server():
|
|||||||
args = [
|
args = [
|
||||||
"--task",
|
"--task",
|
||||||
"generate",
|
"generate",
|
||||||
"--dtype",
|
|
||||||
"bfloat16",
|
|
||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"2048",
|
"2048",
|
||||||
"--max-num-seqs",
|
"--max-num-seqs",
|
||||||
|
|||||||
@@ -28,8 +28,6 @@ def server():
|
|||||||
args = [
|
args = [
|
||||||
"--task",
|
"--task",
|
||||||
"embed",
|
"embed",
|
||||||
"--dtype",
|
|
||||||
"bfloat16",
|
|
||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"2048",
|
"2048",
|
||||||
"--max-num-seqs",
|
"--max-num-seqs",
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ def phi3v_model_config():
|
|||||||
tokenizer=PHI3V_MODEL_ID,
|
tokenizer=PHI3V_MODEL_ID,
|
||||||
tokenizer_mode="auto",
|
tokenizer_mode="auto",
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
dtype="bfloat16",
|
dtype="auto",
|
||||||
seed=0,
|
seed=0,
|
||||||
limit_mm_per_prompt={
|
limit_mm_per_prompt={
|
||||||
"image": 2,
|
"image": 2,
|
||||||
@@ -58,7 +58,7 @@ def mllama_model_config():
|
|||||||
tokenizer=MLLAMA_MODEL_ID,
|
tokenizer=MLLAMA_MODEL_ID,
|
||||||
tokenizer_mode="auto",
|
tokenizer_mode="auto",
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
dtype="bfloat16",
|
dtype="auto",
|
||||||
seed=0,
|
seed=0,
|
||||||
limit_mm_per_prompt={
|
limit_mm_per_prompt={
|
||||||
"image": 2,
|
"image": 2,
|
||||||
@@ -669,7 +669,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
|
|||||||
tokenizer=MLLAMA_MODEL_ID,
|
tokenizer=MLLAMA_MODEL_ID,
|
||||||
tokenizer_mode="auto",
|
tokenizer_mode="auto",
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
dtype="bfloat16",
|
dtype="auto",
|
||||||
seed=0,
|
seed=0,
|
||||||
limit_mm_per_prompt={
|
limit_mm_per_prompt={
|
||||||
"image": 2,
|
"image": 2,
|
||||||
|
|||||||
@@ -5,11 +5,10 @@ from typing import Optional
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
from transformers import AutoModel, AutoTokenizer, BatchEncoding
|
from transformers import AutoModel, AutoTokenizer
|
||||||
|
|
||||||
from vllm.multimodal.audio import resample_audio
|
from vllm.multimodal.audio import resample_audio
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
|
||||||
|
|
||||||
from ....conftest import HfRunner, VllmRunner
|
from ....conftest import HfRunner, VllmRunner
|
||||||
from ....utils import RemoteOpenAIServer
|
from ....utils import RemoteOpenAIServer
|
||||||
@@ -107,8 +106,6 @@ def run_test(
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""Inference result should be the same between hf and vllm."""
|
"""Inference result should be the same between hf and vllm."""
|
||||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
|
||||||
|
|
||||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||||
# vLLM needs a fresh new process without cuda initialization.
|
# vLLM needs a fresh new process without cuda initialization.
|
||||||
# if we run HF first, the cuda initialization will be done and it
|
# if we run HF first, the cuda initialization will be done and it
|
||||||
@@ -124,15 +121,7 @@ def run_test(
|
|||||||
for vllm_prompt, _, audio in prompts_and_audios
|
for vllm_prompt, _, audio in prompts_and_audios
|
||||||
]
|
]
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding, **kwargs):
|
with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
|
||||||
hf_inputs["audio_values"] = hf_inputs["audio_values"] \
|
|
||||||
.to(torch_dtype) # type: ignore
|
|
||||||
return hf_inputs
|
|
||||||
|
|
||||||
with hf_runner(model,
|
|
||||||
dtype=dtype,
|
|
||||||
postprocess_inputs=process,
|
|
||||||
auto_cls=AutoModel) as hf_model:
|
|
||||||
hf_outputs_per_audio = [
|
hf_outputs_per_audio = [
|
||||||
hf_model.generate_greedy_logprobs_limit(
|
hf_model.generate_greedy_logprobs_limit(
|
||||||
[hf_prompt],
|
[hf_prompt],
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from pathlib import PosixPath
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from packaging.version import Version
|
from packaging.version import Version
|
||||||
from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
|
from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
|
||||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
|
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[CustomTestOptions(
|
||||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||||
@@ -121,10 +121,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
"stop_sign": "caption es",
|
"stop_sign": "caption es",
|
||||||
"cherry_blossom": "What is in the picture?",
|
"cherry_blossom": "What is in the picture?",
|
||||||
}),
|
}),
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
|
||||||
"pixel_values"
|
|
||||||
),
|
|
||||||
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
|
marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
|
||||||
@@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = {
|
|||||||
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
||||||
# }),
|
# }),
|
||||||
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
||||||
# postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501
|
|
||||||
# stop_str=["<|im_end|>"],
|
# stop_str=["<|im_end|>"],
|
||||||
# image_size_factors=[(0.10, 0.15)],
|
# image_size_factors=[(0.10, 0.15)],
|
||||||
# max_tokens=64,
|
# max_tokens=64,
|
||||||
@@ -190,7 +186,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
test_type=VLMTestType.IMAGE,
|
test_type=VLMTestType.IMAGE,
|
||||||
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
|
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
|
||||||
img_idx_to_prompt=lambda idx: "",
|
img_idx_to_prompt=lambda idx: "",
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
|
||||||
),
|
),
|
||||||
"chameleon": VLMTestInfo(
|
"chameleon": VLMTestInfo(
|
||||||
@@ -199,10 +195,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
|
||||||
"pixel_values"
|
|
||||||
),
|
|
||||||
# For chameleon, we only compare the sequences
|
# For chameleon, we only compare the sequences
|
||||||
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
||||||
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
||||||
@@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = {
|
|||||||
}),
|
}),
|
||||||
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
|
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
|
||||||
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
|
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
|
||||||
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
|
|
||||||
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
|
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
|
||||||
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
|
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
|
||||||
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
||||||
@@ -240,6 +232,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
img_idx_to_prompt=lambda idx: "",
|
img_idx_to_prompt=lambda idx: "",
|
||||||
max_model_len=2048,
|
max_model_len=2048,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
|
auto_cls=AutoModelForImageTextToText,
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
|
||||||
num_logprobs=10,
|
num_logprobs=10,
|
||||||
@@ -256,9 +249,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
# TODO: Use AutoModelForVision2Seq once transformers supports this
|
auto_cls=AutoModelForImageTextToText,
|
||||||
auto_cls=AutoModelForPreTraining,
|
|
||||||
dtype="bfloat16",
|
|
||||||
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
|
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
|
||||||
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
|
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
|
||||||
),
|
),
|
||||||
@@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = {
|
|||||||
}),
|
}),
|
||||||
max_model_len=2048,
|
max_model_len=2048,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
dtype="bfloat16",
|
|
||||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||||
patch_hf_runner=model_utils.glm4v_patch_hf_runner,
|
patch_hf_runner=model_utils.glm4v_patch_hf_runner,
|
||||||
# The image embeddings match with HF but the outputs of the language
|
# The image embeddings match with HF but the outputs of the language
|
||||||
@@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = {
|
|||||||
}),
|
}),
|
||||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
dtype="bfloat16",
|
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
num_logprobs=10,
|
num_logprobs=10,
|
||||||
patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
|
patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
|
||||||
@@ -307,7 +296,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
img_idx_to_prompt=lambda idx: "<image>",
|
img_idx_to_prompt=lambda idx: "<image>",
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
|
hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
|
||||||
),
|
),
|
||||||
"intern_vl": VLMTestInfo(
|
"intern_vl": VLMTestInfo(
|
||||||
@@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = {
|
|||||||
}),
|
}),
|
||||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
# NOTE: Mono-InternVL-2B doesn't work with fp16,
|
|
||||||
# it will result NaN during inference.
|
|
||||||
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
|
|
||||||
dtype="bfloat16",
|
|
||||||
use_tokenizer_eos=True,
|
use_tokenizer_eos=True,
|
||||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||||
),
|
),
|
||||||
@@ -336,7 +321,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
|
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
|
||||||
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
||||||
max_model_len=10240,
|
max_model_len=10240,
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[CustomTestOptions(
|
||||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||||
@@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = {
|
|||||||
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||||
num_video_frames=16,
|
num_video_frames=16,
|
||||||
max_model_len=16384,
|
max_model_len=16384,
|
||||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
|
||||||
"pixel_values_videos"
|
|
||||||
),
|
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForVision2Seq,
|
||||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[CustomTestOptions(
|
||||||
@@ -378,11 +360,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||||
prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
|
||||||
"pixel_values"
|
|
||||||
),
|
|
||||||
get_stop_token_ids=lambda tok: [128009],
|
get_stop_token_ids=lambda tok: [128009],
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
|
||||||
patch_hf_runner=model_utils.mantis_patch_hf_runner,
|
patch_hf_runner=model_utils.mantis_patch_hf_runner,
|
||||||
marks=[
|
marks=[
|
||||||
@@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
|
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
|
||||||
postprocess_inputs=model_utils.wrap_inputs_post_processor,
|
|
||||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||||
|
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
|
||||||
),
|
),
|
||||||
"minicpmo_26": VLMTestInfo(
|
"minicpmo_26": VLMTestInfo(
|
||||||
models=["openbmb/MiniCPM-o-2_6"],
|
models=["openbmb/MiniCPM-o-2_6"],
|
||||||
@@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||||
postprocess_inputs=model_utils.ignore_inputs_post_processor(
|
|
||||||
"image_sizes"
|
|
||||||
),
|
|
||||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||||
patch_hf_runner=model_utils.minicpmo_patch_hf_runner
|
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||||
),
|
),
|
||||||
"minicpmv_26": VLMTestInfo(
|
"minicpmv_26": VLMTestInfo(
|
||||||
models=["openbmb/MiniCPM-V-2_6"],
|
models=["openbmb/MiniCPM-V-2_6"],
|
||||||
@@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||||
postprocess_inputs=model_utils.ignore_inputs_post_processor(
|
|
||||||
"image_sizes"
|
|
||||||
),
|
|
||||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||||
|
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||||
),
|
),
|
||||||
"molmo": VLMTestInfo(
|
"molmo": VLMTestInfo(
|
||||||
models=["allenai/Molmo-7B-D-0924"],
|
models=["allenai/Molmo-7B-D-0924"],
|
||||||
@@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = {
|
|||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
patch_hf_runner=model_utils.molmo_patch_hf_runner,
|
patch_hf_runner=model_utils.molmo_patch_hf_runner,
|
||||||
postprocess_inputs=model_utils.molmo_post_processor,
|
|
||||||
),
|
),
|
||||||
# Tests for phi3v currently live in another file because of a bug in
|
# Tests for phi3v currently live in another file because of a bug in
|
||||||
# transformers. Once this issue is fixed, we can enable them here instead.
|
# transformers. Once this issue is fixed, we can enable them here instead.
|
||||||
@@ -463,7 +436,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
img_idx_to_prompt=lambda idx: "[IMG]",
|
img_idx_to_prompt=lambda idx: "[IMG]",
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
marks=[large_gpu_mark(min_gb=48)],
|
marks=[large_gpu_mark(min_gb=48)],
|
||||||
),
|
),
|
||||||
"qwen_vl": VLMTestInfo(
|
"qwen_vl": VLMTestInfo(
|
||||||
@@ -481,10 +454,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["facebook/chameleon-7b"],
|
models=["facebook/chameleon-7b"],
|
||||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
|
||||||
"pixel_values"
|
|
||||||
),
|
|
||||||
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
||||||
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
||||||
comparator=check_outputs_equal,
|
comparator=check_outputs_equal,
|
||||||
@@ -495,7 +465,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["llava-hf/llava-1.5-7b-hf"],
|
models=["llava-hf/llava-1.5-7b-hf"],
|
||||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
marks=multi_gpu_marks(num_gpus=2),
|
marks=multi_gpu_marks(num_gpus=2),
|
||||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||||
@@ -504,7 +474,7 @@ VLM_TEST_SETTINGS = {
|
|||||||
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||||
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
||||||
max_model_len=10240,
|
max_model_len=10240,
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForImageTextToText,
|
||||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||||
marks=multi_gpu_marks(num_gpus=2),
|
marks=multi_gpu_marks(num_gpus=2),
|
||||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||||
@@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = {
|
|||||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||||
max_model_len=16384,
|
max_model_len=16384,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
|
||||||
"pixel_values"
|
|
||||||
),
|
|
||||||
auto_cls=AutoModelForVision2Seq,
|
auto_cls=AutoModelForVision2Seq,
|
||||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||||
custom_test_opts=[CustomTestOptions(
|
custom_test_opts=[CustomTestOptions(
|
||||||
|
|||||||
@@ -4,7 +4,6 @@
|
|||||||
Run `pytest tests/models/test_mistral.py`.
|
Run `pytest tests/models/test_mistral.py`.
|
||||||
"""
|
"""
|
||||||
import json
|
import json
|
||||||
import uuid
|
|
||||||
from dataclasses import asdict
|
from dataclasses import asdict
|
||||||
from typing import TYPE_CHECKING, Any, Optional
|
from typing import TYPE_CHECKING, Any, Optional
|
||||||
|
|
||||||
@@ -16,8 +15,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
|||||||
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||||
from transformers import AutoProcessor
|
from transformers import AutoProcessor
|
||||||
|
|
||||||
from vllm import (EngineArgs, LLMEngine, RequestOutput, SamplingParams,
|
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
|
||||||
TextPrompt, TokensPrompt)
|
|
||||||
from vllm.multimodal import MultiModalDataBuiltins
|
from vllm.multimodal import MultiModalDataBuiltins
|
||||||
from vllm.multimodal.inputs import PlaceholderRange
|
from vllm.multimodal.inputs import PlaceholderRange
|
||||||
from vllm.sequence import Logprob, SampleLogprobs
|
from vllm.sequence import Logprob, SampleLogprobs
|
||||||
@@ -28,7 +26,11 @@ from ...utils import check_logprobs_close
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from _typeshed import StrPath
|
from _typeshed import StrPath
|
||||||
|
|
||||||
MODELS = ["mistralai/Pixtral-12B-2409"]
|
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
|
||||||
|
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||||
|
|
||||||
|
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
|
||||||
|
|
||||||
IMG_URLS = [
|
IMG_URLS = [
|
||||||
"https://picsum.photos/id/237/400/300",
|
"https://picsum.photos/id/237/400/300",
|
||||||
"https://picsum.photos/id/231/200/300",
|
"https://picsum.photos/id/231/200/300",
|
||||||
@@ -125,8 +127,10 @@ MAX_MODEL_LEN = [8192, 65536]
|
|||||||
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
|
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
|
||||||
assert FIXTURES_PATH.exists()
|
assert FIXTURES_PATH.exists()
|
||||||
|
|
||||||
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
|
FIXTURE_LOGPROBS_CHAT = {
|
||||||
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
|
PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
|
||||||
|
MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
|
||||||
|
}
|
||||||
|
|
||||||
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
|
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
|
||||||
|
|
||||||
@@ -166,12 +170,12 @@ def test_chat(
|
|||||||
model: str,
|
model: str,
|
||||||
dtype: str,
|
dtype: str,
|
||||||
) -> None:
|
) -> None:
|
||||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
|
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
|
||||||
|
FIXTURE_LOGPROBS_CHAT[model])
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
tokenizer_mode="mistral",
|
tokenizer_mode="mistral",
|
||||||
enable_chunked_prefill=False,
|
|
||||||
max_model_len=max_model_len,
|
max_model_len=max_model_len,
|
||||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
@@ -183,70 +187,40 @@ def test_chat(
|
|||||||
outputs.extend(output)
|
outputs.extend(output)
|
||||||
|
|
||||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||||
|
# Remove last `None` prompt_logprobs to compare with fixture
|
||||||
|
for i in range(len(logprobs)):
|
||||||
|
assert logprobs[i][-1] is None
|
||||||
|
logprobs[i] = logprobs[i][:-1]
|
||||||
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||||
outputs_1_lst=logprobs,
|
outputs_1_lst=logprobs,
|
||||||
name_0="h100_ref",
|
name_0="h100_ref",
|
||||||
name_1="output")
|
name_1="output")
|
||||||
|
|
||||||
|
|
||||||
@large_gpu_test(min_gb=80)
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
|
||||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
|
||||||
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
|
|
||||||
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
|
|
||||||
args = EngineArgs(
|
|
||||||
model=model,
|
|
||||||
tokenizer_mode="mistral",
|
|
||||||
enable_chunked_prefill=False,
|
|
||||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
|
||||||
dtype=dtype,
|
|
||||||
)
|
|
||||||
engine = LLMEngine.from_engine_args(args)
|
|
||||||
|
|
||||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
|
|
||||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
|
|
||||||
|
|
||||||
outputs = []
|
|
||||||
count = 0
|
|
||||||
while True:
|
|
||||||
out = engine.step()
|
|
||||||
count += 1
|
|
||||||
for request_output in out:
|
|
||||||
if request_output.finished:
|
|
||||||
outputs.append(request_output)
|
|
||||||
|
|
||||||
if count == 2:
|
|
||||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
|
|
||||||
SAMPLING_PARAMS)
|
|
||||||
if not engine.has_unfinished_requests():
|
|
||||||
break
|
|
||||||
|
|
||||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
|
||||||
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
|
|
||||||
outputs_1_lst=logprobs,
|
|
||||||
name_0="h100_ref",
|
|
||||||
name_1="output")
|
|
||||||
|
|
||||||
|
|
||||||
@large_gpu_test(min_gb=48)
|
@large_gpu_test(min_gb=48)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"prompt,expected_ranges",
|
"prompt,expected_ranges",
|
||||||
[(_create_engine_inputs_hf(IMG_URLS[:1]), [{
|
[(_create_engine_inputs_hf(IMG_URLS[:1]), [{
|
||||||
"offset": 10,
|
"offset": 11,
|
||||||
"length": 494
|
"length": 494
|
||||||
}]),
|
}]),
|
||||||
(_create_engine_inputs_hf(IMG_URLS[1:4]), [{
|
(_create_engine_inputs_hf(IMG_URLS[1:4]), [{
|
||||||
"offset": 10,
|
"offset": 11,
|
||||||
"length": 266
|
"length": 266
|
||||||
}, {
|
}, {
|
||||||
"offset": 276,
|
"offset": 277,
|
||||||
"length": 1056
|
"length": 1056
|
||||||
}, {
|
}, {
|
||||||
"offset": 1332,
|
"offset": 1333,
|
||||||
"length": 418
|
"length": 418
|
||||||
}])])
|
}])])
|
||||||
def test_multi_modal_placeholders(
|
def test_multi_modal_placeholders(vllm_runner, prompt,
|
||||||
vllm_runner, prompt, expected_ranges: list[PlaceholderRange]) -> None:
|
expected_ranges: list[PlaceholderRange],
|
||||||
|
monkeypatch) -> None:
|
||||||
|
|
||||||
|
# This placeholder checking test only works with V0 engine
|
||||||
|
# where `multi_modal_placeholders` is returned with `RequestOutput`
|
||||||
|
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
"mistral-community/pixtral-12b",
|
"mistral-community/pixtral-12b",
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from transformers import BatchEncoding
|
|
||||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||||
|
|
||||||
from vllm.config import TaskOption
|
from vllm.config import TaskOption
|
||||||
@@ -31,7 +30,6 @@ def run_test(
|
|||||||
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
|
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
|
||||||
auto_cls: type[_BaseAutoModelClass],
|
auto_cls: type[_BaseAutoModelClass],
|
||||||
use_tokenizer_eos: bool,
|
use_tokenizer_eos: bool,
|
||||||
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
|
|
||||||
comparator: Callable[..., None],
|
comparator: Callable[..., None],
|
||||||
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
|
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
|
||||||
stop_str: Optional[list[str]],
|
stop_str: Optional[list[str]],
|
||||||
@@ -101,7 +99,6 @@ def run_test(
|
|||||||
hf_model = hf_runner(model,
|
hf_model = hf_runner(model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
auto_cls=auto_cls,
|
auto_cls=auto_cls,
|
||||||
postprocess_inputs=postprocess_inputs,
|
|
||||||
model_kwargs=hf_model_kwargs)
|
model_kwargs=hf_model_kwargs)
|
||||||
|
|
||||||
# Some models need to patch things like the model processor, e.g., internvl
|
# Some models need to patch things like the model processor, e.g., internvl
|
||||||
|
|||||||
@@ -6,16 +6,15 @@ typically specific to a small subset of models.
|
|||||||
import re
|
import re
|
||||||
import types
|
import types
|
||||||
from pathlib import PosixPath
|
from pathlib import PosixPath
|
||||||
from typing import Callable, Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
|
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||||
GenerationConfig)
|
GenerationConfig)
|
||||||
|
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
from vllm.transformers_utils.tokenizer import patch_padding_side
|
from vllm.transformers_utils.tokenizer import patch_padding_side
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
|
||||||
|
|
||||||
from .....conftest import HfRunner, ImageAsset, _ImageAssets
|
from .....conftest import HfRunner, ImageAsset, _ImageAssets
|
||||||
from .types import RunnerOutput
|
from .types import RunnerOutput
|
||||||
@@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets):
|
|||||||
return [asset.image_embeds for asset in image_assets]
|
return [asset.image_embeds for asset in image_assets]
|
||||||
|
|
||||||
|
|
||||||
####### postprocessors to run on HF BatchEncoding
|
|
||||||
def cast_dtype_post_processor(
|
|
||||||
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
|
|
||||||
"""Gets a handle to a post processor which converts a given key into a
|
|
||||||
target data type."""
|
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding, dtype: str):
|
|
||||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
|
||||||
hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
|
|
||||||
return hf_inputs
|
|
||||||
|
|
||||||
return process
|
|
||||||
|
|
||||||
|
|
||||||
def ignore_inputs_post_processor(
|
|
||||||
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
|
|
||||||
"""Gets a handle to a post processor which ignores a given key."""
|
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding, dtype: str):
|
|
||||||
del hf_inputs[hf_inp_key]
|
|
||||||
return hf_inputs
|
|
||||||
|
|
||||||
return process
|
|
||||||
|
|
||||||
|
|
||||||
def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
|
|
||||||
return {"model_inputs": hf_inputs}
|
|
||||||
|
|
||||||
|
|
||||||
def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
|
|
||||||
hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
|
|
||||||
return {k: v.unsqueeze(0) for k, v in hf_inputs.items()}
|
|
||||||
|
|
||||||
|
|
||||||
####### Prompt path encoders for models that need models on disk
|
####### Prompt path encoders for models that need models on disk
|
||||||
def qwen_prompt_path_encoder(
|
def qwen_prompt_path_encoder(
|
||||||
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
|
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
|
||||||
@@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
|||||||
for k in inputs.keys() # noqa
|
for k in inputs.keys() # noqa
|
||||||
if k not in ("seq_lens", "sft_format")
|
if k not in ("seq_lens", "sft_format")
|
||||||
}
|
}
|
||||||
inputs = BatchEncoding(data=inputs, tensor_type="pt")
|
return BatchFeature(data=inputs, tensor_type="pt")
|
||||||
return inputs
|
|
||||||
|
|
||||||
hf_model.processor = processor
|
hf_model.processor = processor
|
||||||
hf_model.model.get_output_embeddings = lambda: \
|
hf_model.model.get_output_embeddings = lambda: \
|
||||||
@@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
|||||||
return hf_model
|
return hf_model
|
||||||
|
|
||||||
|
|
||||||
def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
def minicpmv_25_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||||
orig_generate = hf_model.model.generate
|
orig_generate = hf_model.model.generate
|
||||||
|
|
||||||
def _generate(self, *args, **kwargs):
|
def _generate(
|
||||||
|
self,
|
||||||
|
*args,
|
||||||
|
input_ids=None,
|
||||||
|
pixel_values=None,
|
||||||
|
image_sizes=None,
|
||||||
|
image_bound=None,
|
||||||
|
tgt_sizes=None,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
model_inputs = {
|
||||||
|
"input_ids": input_ids,
|
||||||
|
"pixel_values": pixel_values,
|
||||||
|
"image_sizes": image_sizes,
|
||||||
|
"image_bound": image_bound,
|
||||||
|
"tgt_sizes": tgt_sizes,
|
||||||
|
}
|
||||||
|
for k in list(model_inputs.keys()):
|
||||||
|
if model_inputs[k] is None:
|
||||||
|
model_inputs.pop(k)
|
||||||
|
|
||||||
|
return orig_generate(model_inputs, *args, decode_text=False, **kwargs)
|
||||||
|
|
||||||
|
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||||
|
|
||||||
|
return hf_model
|
||||||
|
|
||||||
|
|
||||||
|
def minicpmo_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||||
|
orig_generate = hf_model.model.generate
|
||||||
|
|
||||||
|
def _generate(self, *args, image_sizes=None, **kwargs):
|
||||||
|
return orig_generate(*args, decode_text=False, **kwargs)
|
||||||
|
|
||||||
|
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||||
|
|
||||||
|
return hf_model
|
||||||
|
|
||||||
|
|
||||||
|
def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||||
|
orig_generate = hf_model.model.generate
|
||||||
|
|
||||||
|
def _generate(self, *args, image_sizes=None, **kwargs):
|
||||||
return orig_generate(*args, decode_text=False, **kwargs)
|
return orig_generate(*args, decode_text=False, **kwargs)
|
||||||
|
|
||||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||||
@@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
|||||||
|
|
||||||
def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
|
def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
|
||||||
batch = {
|
batch = {
|
||||||
k: kwargs.pop(k)
|
k: kwargs.pop(k).unsqueeze(0)
|
||||||
for k in ("input_ids", "images", "image_input_idx", "image_masks")
|
for k in ("input_ids", "images", "image_input_idx", "image_masks")
|
||||||
if k in kwargs
|
if k in kwargs
|
||||||
}
|
}
|
||||||
|
batch = BatchFeature(batch).to(dtype=self.dtype)
|
||||||
|
|
||||||
return self.generate_from_batch(
|
return self.generate_from_batch(
|
||||||
batch,
|
batch,
|
||||||
|
|||||||
@@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union
|
|||||||
import torch
|
import torch
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from pytest import MarkDecorator
|
from pytest import MarkDecorator
|
||||||
from transformers import AutoModelForCausalLM, BatchEncoding
|
from transformers import AutoModelForCausalLM
|
||||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||||
|
|
||||||
from vllm.config import TaskOption
|
from vllm.config import TaskOption
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.utils import identity
|
|
||||||
|
|
||||||
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
|
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
|
||||||
from ....utils import check_logprobs_close
|
from ....utils import check_logprobs_close
|
||||||
@@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple):
|
|||||||
# Indicates we should explicitly pass the EOS from the tokenizer
|
# Indicates we should explicitly pass the EOS from the tokenizer
|
||||||
use_tokenizer_eos: bool = False
|
use_tokenizer_eos: bool = False
|
||||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
|
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
|
||||||
# Callable to pass to the HF runner to run on inputs; for now, we also pass
|
|
||||||
# the data type to input post processing, because almost all of the uses of
|
|
||||||
# postprocess_inputs are to fix the data types of BatchEncoding values.
|
|
||||||
postprocess_inputs: Callable[[BatchEncoding, str],
|
|
||||||
BatchEncoding] = identity
|
|
||||||
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
|
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
|
||||||
|
|
||||||
# Post processors that if defined, will run oun the outputs of the
|
# Post processors that if defined, will run oun the outputs of the
|
||||||
@@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple):
|
|||||||
# is all combinations of .models + all fields below
|
# is all combinations of .models + all fields below
|
||||||
max_tokens: Union[int, tuple[int]] = 128
|
max_tokens: Union[int, tuple[int]] = 128
|
||||||
num_logprobs: Union[int, tuple[int]] = 5
|
num_logprobs: Union[int, tuple[int]] = 5
|
||||||
dtype: Union[str, Iterable[str]] = "half"
|
dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
|
||||||
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
|
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
|
||||||
# Only expanded in video tests
|
# Only expanded in video tests
|
||||||
num_video_frames: Union[int, tuple[int]] = 16
|
num_video_frames: Union[int, tuple[int]] = 16
|
||||||
@@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
|
|||||||
"vllm_output_post_proc": self.vllm_output_post_proc,
|
"vllm_output_post_proc": self.vllm_output_post_proc,
|
||||||
"auto_cls": self.auto_cls,
|
"auto_cls": self.auto_cls,
|
||||||
"use_tokenizer_eos": self.use_tokenizer_eos,
|
"use_tokenizer_eos": self.use_tokenizer_eos,
|
||||||
"postprocess_inputs": self.postprocess_inputs,
|
|
||||||
"comparator": self.comparator,
|
"comparator": self.comparator,
|
||||||
"get_stop_token_ids": self.get_stop_token_ids,
|
"get_stop_token_ids": self.get_stop_token_ids,
|
||||||
"hf_model_kwargs": self.hf_model_kwargs,
|
"hf_model_kwargs": self.hf_model_kwargs,
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
from functools import partial
|
|
||||||
from typing import Callable
|
from typing import Callable
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from transformers import BatchEncoding, Qwen2VLForConditionalGeneration
|
from transformers import Qwen2VLForConditionalGeneration
|
||||||
|
|
||||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||||
from ....utils import large_gpu_test
|
from ....utils import large_gpu_test
|
||||||
@@ -75,10 +75,6 @@ def apply_chat_template_and_add_eos(
|
|||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
|
|
||||||
def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs):
|
|
||||||
return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def _run_test(
|
def _run_test(
|
||||||
hf_runner: type[HfRunner],
|
hf_runner: type[HfRunner],
|
||||||
vllm_runner: type[VllmRunner],
|
vllm_runner: type[VllmRunner],
|
||||||
@@ -118,14 +114,8 @@ def _run_test(
|
|||||||
with hf_runner(model,
|
with hf_runner(model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
|
auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
|
||||||
hf_model.postprocess_inputs = partial(
|
|
||||||
postprocess_inputs,
|
prompts = []
|
||||||
hf_model,
|
|
||||||
cache_position=torch.arange(
|
|
||||||
0,
|
|
||||||
1, # 1 for batch size
|
|
||||||
requires_grad=False),
|
|
||||||
use_cache=False)
|
|
||||||
for text, image, embed_text in zip(input_texts, input_images,
|
for text, image, embed_text in zip(input_texts, input_images,
|
||||||
embed_texts):
|
embed_texts):
|
||||||
# dse requires non-standard input processing
|
# dse requires non-standard input processing
|
||||||
@@ -133,20 +123,34 @@ def _run_test(
|
|||||||
messages = get_messages(image, text, embed_text)
|
messages = get_messages(image, text, embed_text)
|
||||||
prompt = apply_chat_template_and_add_eos(
|
prompt = apply_chat_template_and_add_eos(
|
||||||
messages, hf_model.processor.apply_chat_template)
|
messages, hf_model.processor.apply_chat_template)
|
||||||
inputs = hf_model.get_inputs(
|
|
||||||
prompts=[[prompt]],
|
prompts.append(prompt)
|
||||||
images=[[image]],
|
|
||||||
)
|
all_inputs = hf_model.get_inputs(
|
||||||
with torch.no_grad():
|
prompts=prompts,
|
||||||
|
images=input_images,
|
||||||
|
)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
all_outputs = []
|
||||||
|
for inputs in all_inputs:
|
||||||
|
inputs = hf_model.model.prepare_inputs_for_generation(
|
||||||
|
**inputs,
|
||||||
|
cache_position=torch.arange(1), # 1 for batch size
|
||||||
|
use_cache=False,
|
||||||
|
)
|
||||||
outputs = hf_model.model(
|
outputs = hf_model.model(
|
||||||
**hf_model.wrap_device(inputs[0],
|
**hf_model.wrap_device(inputs),
|
||||||
device=hf_model.model.device.type),
|
|
||||||
return_dict=True,
|
return_dict=True,
|
||||||
output_hidden_states=True,
|
output_hidden_states=True,
|
||||||
)
|
)
|
||||||
pooled_output = torch.nn.functional.normalize(
|
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
|
||||||
outputs.hidden_states[-1][0, -1], p=2, dim=-1)
|
p=2,
|
||||||
hf_outputs.append(pooled_output.tolist())
|
dim=-1)
|
||||||
|
|
||||||
|
all_outputs.append(pooled_output.tolist())
|
||||||
|
|
||||||
|
hf_outputs = all_outputs
|
||||||
|
|
||||||
check_embeddings_close(
|
check_embeddings_close(
|
||||||
embeddings_0_lst=hf_outputs,
|
embeddings_0_lst=hf_outputs,
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from transformers import AutoModelForVision2Seq
|
from transformers import AutoModelForImageTextToText
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
@@ -70,7 +70,7 @@ def _run_test(
|
|||||||
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
|
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype,
|
with hf_runner(model, dtype=dtype,
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
auto_cls=AutoModelForImageTextToText) as hf_model:
|
||||||
# Patch the issue where generation_config.json is missing
|
# Patch the issue where generation_config.json is missing
|
||||||
hf_model.processor.patch_size = \
|
hf_model.processor.patch_size = \
|
||||||
hf_model.model.config.vision_config.patch_size
|
hf_model.model.config.vision_config.patch_size
|
||||||
@@ -86,8 +86,7 @@ def _run_test(
|
|||||||
for inputs in all_inputs:
|
for inputs in all_inputs:
|
||||||
# Based on: https://huggingface.co/royokong/e5-v
|
# Based on: https://huggingface.co/royokong/e5-v
|
||||||
outputs = hf_model.model(
|
outputs = hf_model.model(
|
||||||
**hf_model.wrap_device(inputs,
|
**hf_model.wrap_device(inputs),
|
||||||
device=hf_model.model.device.type),
|
|
||||||
return_dict=True,
|
return_dict=True,
|
||||||
output_hidden_states=True,
|
output_hidden_states=True,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -53,8 +53,7 @@ def _run_test(
|
|||||||
for inputs in all_inputs:
|
for inputs in all_inputs:
|
||||||
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
|
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
|
||||||
outputs = hf_model.model(
|
outputs = hf_model.model(
|
||||||
**hf_model.wrap_device(inputs,
|
**hf_model.wrap_device(inputs),
|
||||||
device=hf_model.model.device.type),
|
|
||||||
return_dict=True,
|
return_dict=True,
|
||||||
output_hidden_states=True,
|
output_hidden_states=True,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -4,8 +4,7 @@ from typing import Optional, overload
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
|
||||||
BatchEncoding)
|
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
|
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
|
||||||
@@ -227,14 +226,10 @@ def _run_test(
|
|||||||
for prompts, images in inputs
|
for prompts, images in inputs
|
||||||
]
|
]
|
||||||
|
|
||||||
def process(hf_inputs: BatchEncoding, **kwargs):
|
|
||||||
return hf_inputs
|
|
||||||
|
|
||||||
with hf_runner(model,
|
with hf_runner(model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
model_kwargs={"device_map": "auto"},
|
model_kwargs={"device_map": "auto"},
|
||||||
postprocess_inputs=process,
|
auto_cls=AutoModelForImageTextToText) as hf_model:
|
||||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
|
||||||
hf_outputs_per_image = [
|
hf_outputs_per_image = [
|
||||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
|
|||||||
1
tests/models/fixtures/mistral_small_3_chat.json
Normal file
1
tests/models/fixtures/mistral_small_3_chat.json
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
from typing import Optional, Union
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@@ -254,9 +254,9 @@ def check_logprobs_close(
|
|||||||
def build_model_context(
|
def build_model_context(
|
||||||
model_id: str,
|
model_id: str,
|
||||||
task: TaskOption = "auto",
|
task: TaskOption = "auto",
|
||||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
dtype: Union[str, torch.dtype] = "auto",
|
||||||
mm_processor_kwargs: Optional[dict] = None,
|
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||||
limit_mm_per_prompt: Optional[dict] = None,
|
limit_mm_per_prompt: Optional[dict[str, int]] = None,
|
||||||
disable_mm_preprocessor_cache: bool = True,
|
disable_mm_preprocessor_cache: bool = True,
|
||||||
):
|
):
|
||||||
"""Creates an InputContext for a given model.
|
"""Creates an InputContext for a given model.
|
||||||
@@ -274,9 +274,6 @@ def build_model_context(
|
|||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
model_info.check_transformers_version(on_fail="skip")
|
model_info.check_transformers_version(on_fail="skip")
|
||||||
|
|
||||||
if dtype is None:
|
|
||||||
dtype = "half"
|
|
||||||
|
|
||||||
model_config = ModelConfig(
|
model_config = ModelConfig(
|
||||||
model_id,
|
model_id,
|
||||||
task=task,
|
task=task,
|
||||||
|
|||||||
@@ -7,19 +7,25 @@ from unittest.mock import MagicMock
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
|
import torch
|
||||||
from transformers import ProcessorMixin
|
from transformers import ProcessorMixin
|
||||||
|
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
|
||||||
|
MultiModalKwargsItem,
|
||||||
|
MultiModalSharedField)
|
||||||
# yapf conflicts with isort for this block
|
# yapf conflicts with isort for this block
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
|
from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
|
||||||
PromptIndexTargets, PromptInsertion,
|
ProcessingCache, PromptIndexTargets,
|
||||||
PromptReplacement, apply_text_matches,
|
PromptInsertion, PromptReplacement,
|
||||||
|
apply_text_matches,
|
||||||
apply_token_matches,
|
apply_token_matches,
|
||||||
find_mm_placeholders,
|
find_mm_placeholders,
|
||||||
find_text_matches, find_token_matches,
|
find_text_matches, find_token_matches,
|
||||||
iter_token_matches)
|
iter_token_matches,
|
||||||
|
replace_token_matches)
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
from vllm.multimodal.profiling import MultiModalProfiler
|
from vllm.multimodal.profiling import MultiModalProfiler
|
||||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||||
@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
|
|||||||
assert all(match_len == len(match_ids) for match_len in match_lens)
|
assert all(match_len == len(match_ids) for match_len in match_lens)
|
||||||
|
|
||||||
|
|
||||||
|
# yapf: disable
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("token_ids", "match_ids", "new_ids", "expected"),
|
||||||
|
[
|
||||||
|
([], [], [-1], []),
|
||||||
|
([], [32000], [-1], []),
|
||||||
|
(
|
||||||
|
[32000, 32000, 32000],
|
||||||
|
[32000],
|
||||||
|
[-1],
|
||||||
|
[-1, -1, -1],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
[32000, 32000, 32000],
|
||||||
|
[32000, 32000],
|
||||||
|
[-1],
|
||||||
|
[-1, 32000],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
[32000, 32000, 32000],
|
||||||
|
[32000, 32000, 32000],
|
||||||
|
[-1],
|
||||||
|
[-1],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||||
|
[28747, 32000],
|
||||||
|
[-1],
|
||||||
|
[9833, -1, 32000, 32000, 9833, -1, 32000, 918],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||||
|
[28747, 32000, 32000, 32000],
|
||||||
|
[-1],
|
||||||
|
[9833, -1, 9833, 28747, 32000, 32000, 918],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||||
|
[28747, 0, 32000],
|
||||||
|
[-1],
|
||||||
|
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
# yapf: enable
|
||||||
|
def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
||||||
|
result = replace_token_matches(token_ids, match_ids, new_ids)
|
||||||
|
|
||||||
|
# Manually constructed results
|
||||||
|
assert result == expected
|
||||||
|
|
||||||
|
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("prompt", "target_by_key", "expected_by_key"),
|
("prompt", "target_by_key", "expected_by_key"),
|
||||||
@@ -837,6 +895,45 @@ def test_find_mm_placeholders(
|
|||||||
assert result == expected
|
assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
def _dummy_elem(modality: str, key: str, size: int):
|
||||||
|
return MultiModalFieldElem(
|
||||||
|
modality=modality,
|
||||||
|
key=key,
|
||||||
|
data=torch.empty((size, ), dtype=torch.int8),
|
||||||
|
field=MultiModalSharedField(1),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _dummy_item(modality: str, size_by_key: dict[str, int]):
|
||||||
|
return MultiModalKwargsItem.from_elems([
|
||||||
|
_dummy_elem(modality, key, size) for key, size in size_by_key.items()
|
||||||
|
])
|
||||||
|
|
||||||
|
|
||||||
|
def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
|
||||||
|
return MultiModalKwargs.from_items([
|
||||||
|
_dummy_item(modality, size_by_key)
|
||||||
|
for modality, size_by_key in size_by_key_modality.items()
|
||||||
|
])
|
||||||
|
|
||||||
|
|
||||||
|
# yapf: disable
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("item", "expected_size"),
|
||||||
|
[
|
||||||
|
(_dummy_item("a", {"a1": 100}), 100),
|
||||||
|
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
|
||||||
|
(_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
|
||||||
|
],
|
||||||
|
)
|
||||||
|
# yapf: enable
|
||||||
|
def test_cache_item_size(item, expected_size):
|
||||||
|
cache = ProcessingCache.get_lru_cache(2048, type(item))
|
||||||
|
cache[""] = item
|
||||||
|
|
||||||
|
assert cache.currsize == expected_size
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("limit", "num_supported", "is_valid"),
|
("limit", "num_supported", "is_valid"),
|
||||||
@@ -853,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
|||||||
tokenizer_mode="auto",
|
tokenizer_mode="auto",
|
||||||
trust_remote_code=False,
|
trust_remote_code=False,
|
||||||
seed=0,
|
seed=0,
|
||||||
dtype="half",
|
dtype="auto",
|
||||||
revision=None,
|
revision=None,
|
||||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||||
)
|
)
|
||||||
@@ -892,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
|||||||
tokenizer_mode="auto",
|
tokenizer_mode="auto",
|
||||||
trust_remote_code=False,
|
trust_remote_code=False,
|
||||||
seed=0,
|
seed=0,
|
||||||
dtype="half",
|
dtype="auto",
|
||||||
revision=None,
|
revision=None,
|
||||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||||
)
|
)
|
||||||
@@ -965,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
|
|||||||
tokenizer_mode="auto",
|
tokenizer_mode="auto",
|
||||||
trust_remote_code=False,
|
trust_remote_code=False,
|
||||||
seed=0,
|
seed=0,
|
||||||
dtype="half",
|
dtype="auto",
|
||||||
revision=None,
|
revision=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -314,7 +314,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
|
|||||||
|
|
||||||
# Test edge cases
|
# Test edge cases
|
||||||
(1, 128, 16, 1024, 4, 2, 16, False), # large decode batch
|
(1, 128, 16, 1024, 4, 2, 16, False), # large decode batch
|
||||||
(16, 4, 8, 8192, 48, 1, 128, True), # large prefill batch
|
(16, 4, 8, 1024, 4, 2, 128, True), # large prefill batch
|
||||||
(4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA)
|
(4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA)
|
||||||
(4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA)
|
(4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA)
|
||||||
])
|
])
|
||||||
|
|||||||
@@ -15,6 +15,8 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
|
|||||||
|
|
||||||
models_4bit_to_test = [
|
models_4bit_to_test = [
|
||||||
("facebook/opt-125m", "quantize opt model inflight"),
|
("facebook/opt-125m", "quantize opt model inflight"),
|
||||||
|
("mistralai/Mistral-7B-Instruct-v0.3",
|
||||||
|
"quantize inflight model with both HF and Mistral format weights")
|
||||||
]
|
]
|
||||||
|
|
||||||
models_pre_qaunt_4bit_to_test = [
|
models_pre_qaunt_4bit_to_test = [
|
||||||
|
|||||||
@@ -166,7 +166,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
|
|||||||
test_prompts = multilora_inference.create_test_prompts(lora_path)
|
test_prompts = multilora_inference.create_test_prompts(lora_path)
|
||||||
|
|
||||||
# Serialize model before deserializing and binding LoRA adapters
|
# Serialize model before deserializing and binding LoRA adapters
|
||||||
with vllm_runner(model_ref, ) as vllm_model:
|
with vllm_runner(model_ref) as vllm_model:
|
||||||
model_path = tmp_path / (model_ref + ".tensors")
|
model_path = tmp_path / (model_ref + ".tensors")
|
||||||
|
|
||||||
vllm_model.apply_model(
|
vllm_model.apply_model(
|
||||||
@@ -208,7 +208,7 @@ def test_load_without_tensorizer_load_format(vllm_runner):
|
|||||||
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
|
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
|
||||||
def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
|
def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
|
||||||
## Serialize model
|
## Serialize model
|
||||||
with vllm_runner(model_ref, ) as vllm_model:
|
with vllm_runner(model_ref) as vllm_model:
|
||||||
model_path = tmp_path / (model_ref + ".tensors")
|
model_path = tmp_path / (model_ref + ".tensors")
|
||||||
|
|
||||||
vllm_model.apply_model(
|
vllm_model.apply_model(
|
||||||
|
|||||||
@@ -34,7 +34,9 @@ with depyf.prepare_debug(temp_dir):
|
|||||||
|
|
||||||
# disable custom dispatcher, let Dynamo takes over
|
# disable custom dispatcher, let Dynamo takes over
|
||||||
# all the control
|
# all the control
|
||||||
llm = LLM(model="google/gemma-2b",
|
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||||
|
max_model_len=512,
|
||||||
|
max_num_seqs=64,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
|
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
@@ -44,38 +46,51 @@ with depyf.prepare_debug(temp_dir):
|
|||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
assert generated_text.startswith(answer)
|
assert generated_text.startswith(answer)
|
||||||
|
|
||||||
compiled_code = sorted(
|
compiled_codes = sorted(
|
||||||
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
|
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
|
||||||
|
|
||||||
# we should only trigger Dynamo compilation three times:
|
for i, compiled_code in enumerate(compiled_codes):
|
||||||
# one for the profiling phase without kv cache
|
print("{} file: {}".format(i + 1, compiled_code))
|
||||||
# one for the prefill phase with symbolic shapes
|
|
||||||
# one for the decode phase with symbolic shapes
|
# We should only trigger Dynamo compilation 4 times:
|
||||||
|
# 1. forward pass (symbolic)
|
||||||
|
# 2. compute_logits (symbolic)
|
||||||
|
# 3. forward pass (shape 16)
|
||||||
|
# 4. forward pass (shape 32)
|
||||||
# and later calls should not trigger Dynamo compilation again.
|
# and later calls should not trigger Dynamo compilation again.
|
||||||
# NOTE: it might still trigger XLA compilation.
|
# NOTE: It might still trigger XLA compilation.
|
||||||
|
|
||||||
# check we have three compiled code
|
# Check we have 4 compiled codes
|
||||||
# this is the assumption when we use the custom dispatcher
|
assert len(compiled_codes) == 4
|
||||||
assert len(compiled_code) == 3
|
|
||||||
|
|
||||||
# check all the compilations are as expected
|
kv_cache_prefix = "kv_cache"
|
||||||
compiled_fn = sorted(
|
attn_prefix = "ragged_paged_attention"
|
||||||
|
|
||||||
|
# Check all the compilations are as expected
|
||||||
|
compiled_fns = sorted(
|
||||||
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
|
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
|
||||||
|
|
||||||
# the first compilation is the profiling phase,
|
for i, compiled_fn in enumerate(compiled_fns):
|
||||||
# it should not have any kv cache
|
print("{} file: {}".format(i + 1, compiled_fn))
|
||||||
with open(compiled_fn[0]) as f:
|
|
||||||
content = f.read()
|
|
||||||
assert "kv_caches" not in content
|
|
||||||
|
|
||||||
# the second compilation is the prefill phase,
|
# The first compilation is symbolic, so it should not have any kv_caches
|
||||||
# it should have kv cache and the flash_attention op
|
with open(compiled_fns[0]) as f:
|
||||||
with open(compiled_fn[1]) as f:
|
|
||||||
content = f.read()
|
content = f.read()
|
||||||
assert "kv_caches" in content and "torch.ops.xla.flash_attention" in content
|
assert kv_cache_prefix not in content
|
||||||
|
|
||||||
# the third compilation is the decode phase,
|
# The second compilation is symbolic, so it should not have any kv_caches
|
||||||
# it should have kv cache and the paged_attention op
|
with open(compiled_fns[1]) as f:
|
||||||
with open(compiled_fn[2]) as f:
|
|
||||||
content = f.read()
|
content = f.read()
|
||||||
assert "kv_caches" in content and "torch.ops.xla.paged_attention" in content
|
assert kv_cache_prefix not in content
|
||||||
|
|
||||||
|
# The third compilation is shape 16, so it should have kv_caches and the
|
||||||
|
# ragged_paged_attention
|
||||||
|
with open(compiled_fns[2]) as f:
|
||||||
|
content = f.read()
|
||||||
|
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||||
|
|
||||||
|
# The forth compilation is shape 32, so it should have kv_caches and the
|
||||||
|
# ragged_paged_attention
|
||||||
|
with open(compiled_fns[3]) as f:
|
||||||
|
content = f.read()
|
||||||
|
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||||
|
|||||||
@@ -14,12 +14,17 @@ from ..utils import compare_two_settings
|
|||||||
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
|
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_RPC_TIMEOUT", "30000")
|
m.setenv("VLLM_RPC_TIMEOUT", "30000")
|
||||||
compare_two_settings(
|
compare_two_settings("Qwen/Qwen2.5-1.5B-Instruct",
|
||||||
"google/gemma-2b",
|
arg1=[
|
||||||
arg1=[
|
"--max-model-len=256",
|
||||||
"--enforce-eager",
|
"--max-num-seqs=32",
|
||||||
f"-O{CompilationLevel.DYNAMO_ONCE}",
|
"--enforce-eager",
|
||||||
],
|
f"-O{CompilationLevel.DYNAMO_ONCE}",
|
||||||
arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
|
],
|
||||||
env1={},
|
arg2=[
|
||||||
env2={})
|
"--max-model-len=256", "--max-num-seqs=32",
|
||||||
|
"--enforce-eager",
|
||||||
|
f"-O{CompilationLevel.DYNAMO_AS_IS}"
|
||||||
|
],
|
||||||
|
env1={},
|
||||||
|
env2={})
|
||||||
|
|||||||
@@ -76,21 +76,18 @@ async def generate(engine: AsyncLLM,
|
|||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
|
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
|
||||||
@pytest.mark.parametrize("engine_args_and_prompt",
|
@pytest.mark.parametrize("engine_args,prompt",
|
||||||
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
||||||
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_load(
|
async def test_load(monkeypatch: pytest.MonkeyPatch,
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
output_kind: RequestOutputKind,
|
||||||
output_kind: RequestOutputKind,
|
engine_args: AsyncEngineArgs, prompt: PromptType):
|
||||||
engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType],
|
|
||||||
):
|
|
||||||
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
|
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
|
||||||
# so that in the future when we switch, we don't have to change all the
|
# so that in the future when we switch, we don't have to change all the
|
||||||
# tests.
|
# tests.
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with monkeypatch.context() as m, ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
engine_args, prompt = engine_args_and_prompt
|
|
||||||
|
|
||||||
engine = AsyncLLM.from_engine_args(engine_args)
|
engine = AsyncLLM.from_engine_args(engine_args)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
@@ -124,18 +121,16 @@ async def test_load(
|
|||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
|
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
|
||||||
@pytest.mark.parametrize("engine_args_and_prompt",
|
@pytest.mark.parametrize("engine_args,prompt",
|
||||||
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
||||||
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_abort(monkeypatch: pytest.MonkeyPatch,
|
async def test_abort(monkeypatch: pytest.MonkeyPatch,
|
||||||
output_kind: RequestOutputKind,
|
output_kind: RequestOutputKind,
|
||||||
engine_args_and_prompt: tuple[AsyncEngineArgs,
|
engine_args: AsyncEngineArgs, prompt: PromptType):
|
||||||
PromptType]):
|
|
||||||
|
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with monkeypatch.context() as m, ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
engine_args, prompt = engine_args_and_prompt
|
|
||||||
|
|
||||||
engine = AsyncLLM.from_engine_args(engine_args)
|
engine = AsyncLLM.from_engine_args(engine_args)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
@@ -193,17 +188,15 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch,
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("n", [1, 3])
|
@pytest.mark.parametrize("n", [1, 3])
|
||||||
@pytest.mark.parametrize("engine_args_and_prompt",
|
@pytest.mark.parametrize("engine_args,prompt",
|
||||||
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
||||||
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_finished_flag(monkeypatch, n: int,
|
async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int,
|
||||||
engine_args_and_prompt: tuple[AsyncEngineArgs,
|
engine_args: AsyncEngineArgs, prompt: PromptType):
|
||||||
PromptType]):
|
|
||||||
|
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with monkeypatch.context() as m, ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
engine_args, prompt = engine_args_and_prompt
|
|
||||||
|
|
||||||
engine = AsyncLLM.from_engine_args(engine_args)
|
engine = AsyncLLM.from_engine_args(engine_args)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ def _get_test_sampling_params(
|
|||||||
"""Generate random sampling params for a batch."""
|
"""Generate random sampling params for a batch."""
|
||||||
|
|
||||||
def get_mostly_n_gt1() -> int:
|
def get_mostly_n_gt1() -> int:
|
||||||
"""Mostly n \in [2,20], ~1/3 n=1"""
|
r"""Mostly n \in [2,20], ~1/3 n=1"""
|
||||||
x = random.randint(0, 28)
|
x = random.randint(0, 28)
|
||||||
if x < 10:
|
if x < 10:
|
||||||
return 1
|
return 1
|
||||||
|
|||||||
@@ -6,20 +6,23 @@ import torch
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
|
||||||
from vllm.v1.sample.metadata import SamplingMetadata
|
from vllm.v1.sample.metadata import SamplingMetadata
|
||||||
from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler
|
from vllm.v1.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID,
|
||||||
|
RejectionSampler)
|
||||||
|
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||||
|
|
||||||
DEVICE = "cpu"
|
DEVICE = "cuda"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def sampler():
|
def rejection_sampler():
|
||||||
return RejectionSampler()
|
return RejectionSampler()
|
||||||
|
|
||||||
|
|
||||||
def create_logits_tensor(token_ids: list[list[int]],
|
def create_logits_tensor(output_token_ids: list[list[int]],
|
||||||
vocab_size: int = 100) -> torch.Tensor:
|
vocab_size: int = 100) -> torch.Tensor:
|
||||||
"""Helper function to create logits tensor that
|
"""Helper function to create logits tensor that
|
||||||
will produce desired token ids on argmax"""
|
will produce desired token ids on argmax"""
|
||||||
|
token_ids = [tokens[:-1] for tokens in output_token_ids]
|
||||||
num_total_tokens = sum(len(tokens) for tokens in token_ids)
|
num_total_tokens = sum(len(tokens) for tokens in token_ids)
|
||||||
logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE)
|
logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE)
|
||||||
start_loc = 0
|
start_loc = 0
|
||||||
@@ -31,15 +34,22 @@ def create_logits_tensor(token_ids: list[list[int]],
|
|||||||
|
|
||||||
|
|
||||||
def create_sampling_metadata(
|
def create_sampling_metadata(
|
||||||
all_greedy: bool,
|
all_greedy: bool,
|
||||||
generators: Optional[dict[int, Any]] = None) -> SamplingMetadata:
|
temperature: Optional[torch.Tensor] = None,
|
||||||
|
generators: Optional[dict[int, Any]] = None,
|
||||||
|
) -> SamplingMetadata:
|
||||||
"""Create a v1 sampling metadata object with all_greedy set
|
"""Create a v1 sampling metadata object with all_greedy set
|
||||||
to the given value. Either all greedy or all random sampling
|
to the given value. Either all greedy or all random sampling
|
||||||
is used.
|
is used.
|
||||||
"""
|
"""
|
||||||
generators = generators or {}
|
generators = generators or {}
|
||||||
|
if all_greedy:
|
||||||
|
temperature = None
|
||||||
|
else:
|
||||||
|
assert temperature is not None
|
||||||
|
|
||||||
return SamplingMetadata(
|
return SamplingMetadata(
|
||||||
temperature=torch.tensor([]),
|
temperature=temperature,
|
||||||
all_greedy=all_greedy,
|
all_greedy=all_greedy,
|
||||||
all_random=not all_greedy,
|
all_random=not all_greedy,
|
||||||
top_p=None,
|
top_p=None,
|
||||||
@@ -61,7 +71,7 @@ def create_sampling_metadata(
|
|||||||
|
|
||||||
|
|
||||||
########################### Tests for Greedy Sampling ###################
|
########################### Tests for Greedy Sampling ###################
|
||||||
def test_perfect_match(sampler):
|
def test_perfect_match(rejection_sampler):
|
||||||
"""Test when output tokens perfectly match speculated tokens"""
|
"""Test when output tokens perfectly match speculated tokens"""
|
||||||
spec_tokens = [[1, 2, 3]]
|
spec_tokens = [[1, 2, 3]]
|
||||||
output_tokens = [[1, 2, 3, 4]] # 4 is the bonus token
|
output_tokens = [[1, 2, 3, 4]] # 4 is the bonus token
|
||||||
@@ -70,15 +80,23 @@ def test_perfect_match(sampler):
|
|||||||
logits = create_logits_tensor(output_tokens)
|
logits = create_logits_tensor(output_tokens)
|
||||||
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
||||||
device=logits.device)
|
device=logits.device)
|
||||||
|
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||||
|
device=logits.device)
|
||||||
|
|
||||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
output = rejection_sampler(
|
||||||
|
spec_decode_metadata,
|
||||||
|
draft_probs=None,
|
||||||
|
target_logits=logits,
|
||||||
|
bonus_token_ids=bonus_token_tensor,
|
||||||
|
sampling_metadata=metadata,
|
||||||
|
)
|
||||||
expected = torch.tensor([[1, 2, 3, 4]],
|
expected = torch.tensor([[1, 2, 3, 4]],
|
||||||
dtype=torch.int,
|
dtype=torch.int,
|
||||||
device=logits.device)
|
device=logits.device)
|
||||||
assert torch.equal(output, expected)
|
assert torch.equal(output, expected)
|
||||||
|
|
||||||
|
|
||||||
def test_early_mismatch(sampler):
|
def test_early_mismatch(rejection_sampler):
|
||||||
"""Test when there's an early mismatch in tokens"""
|
"""Test when there's an early mismatch in tokens"""
|
||||||
spec_tokens = [[1, 2, 3]]
|
spec_tokens = [[1, 2, 3]]
|
||||||
output_tokens = [[1, 5, 3, 4]] # Mismatch at position 1
|
output_tokens = [[1, 5, 3, 4]] # Mismatch at position 1
|
||||||
@@ -87,15 +105,25 @@ def test_early_mismatch(sampler):
|
|||||||
logits = create_logits_tensor(output_tokens)
|
logits = create_logits_tensor(output_tokens)
|
||||||
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
||||||
device=logits.device)
|
device=logits.device)
|
||||||
|
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||||
|
device=logits.device)
|
||||||
|
|
||||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
output = rejection_sampler(
|
||||||
expected = torch.tensor([[1, 5, INVALID_TOKEN_ID, INVALID_TOKEN_ID]],
|
spec_decode_metadata,
|
||||||
dtype=torch.int,
|
draft_probs=None,
|
||||||
device=logits.device)
|
target_logits=logits,
|
||||||
|
bonus_token_ids=bonus_token_tensor,
|
||||||
|
sampling_metadata=metadata,
|
||||||
|
)
|
||||||
|
expected = torch.tensor(
|
||||||
|
[[1, 5, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID]],
|
||||||
|
dtype=torch.int,
|
||||||
|
device=logits.device,
|
||||||
|
)
|
||||||
assert torch.equal(output, expected)
|
assert torch.equal(output, expected)
|
||||||
|
|
||||||
|
|
||||||
def test_multiple_sequences(sampler):
|
def test_multiple_sequences(rejection_sampler):
|
||||||
"""Test handling multiple sequences of speculated tokens"""
|
"""Test handling multiple sequences of speculated tokens"""
|
||||||
spec_tokens = [[1, 2], [3]]
|
spec_tokens = [[1, 2], [3]]
|
||||||
output_tokens = [[1, 2, 5], [3,
|
output_tokens = [[1, 2, 5], [3,
|
||||||
@@ -105,15 +133,23 @@ def test_multiple_sequences(sampler):
|
|||||||
logits = create_logits_tensor(output_tokens)
|
logits = create_logits_tensor(output_tokens)
|
||||||
bonus_token_tensor = torch.tensor(
|
bonus_token_tensor = torch.tensor(
|
||||||
[output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
|
[output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
|
||||||
|
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||||
|
device=logits.device)
|
||||||
|
|
||||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
output = rejection_sampler(
|
||||||
expected = torch.tensor([[1, 2, 5], [3, 4, INVALID_TOKEN_ID]],
|
spec_decode_metadata,
|
||||||
|
draft_probs=None,
|
||||||
|
target_logits=logits,
|
||||||
|
bonus_token_ids=bonus_token_tensor,
|
||||||
|
sampling_metadata=metadata,
|
||||||
|
)
|
||||||
|
expected = torch.tensor([[1, 2, 5], [3, 4, PLACEHOLDER_TOKEN_ID]],
|
||||||
dtype=torch.int,
|
dtype=torch.int,
|
||||||
device=logits.device)
|
device=logits.device)
|
||||||
assert torch.equal(output, expected)
|
assert torch.equal(output, expected)
|
||||||
|
|
||||||
|
|
||||||
def test_single_token_sequence(sampler):
|
def test_single_token_sequence(rejection_sampler):
|
||||||
"""Test handling sequences with single token"""
|
"""Test handling sequences with single token"""
|
||||||
spec_tokens = [[1]]
|
spec_tokens = [[1]]
|
||||||
output_tokens = [[1, 2]] # Single token with bonus token 2
|
output_tokens = [[1, 2]] # Single token with bonus token 2
|
||||||
@@ -122,13 +158,21 @@ def test_single_token_sequence(sampler):
|
|||||||
logits = create_logits_tensor(output_tokens)
|
logits = create_logits_tensor(output_tokens)
|
||||||
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
||||||
device=logits.device)
|
device=logits.device)
|
||||||
|
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||||
|
device=logits.device)
|
||||||
|
|
||||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
output = rejection_sampler(
|
||||||
|
spec_decode_metadata,
|
||||||
|
draft_probs=None,
|
||||||
|
target_logits=logits,
|
||||||
|
bonus_token_ids=bonus_token_tensor,
|
||||||
|
sampling_metadata=metadata,
|
||||||
|
)
|
||||||
expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device)
|
expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device)
|
||||||
assert torch.equal(output, expected)
|
assert torch.equal(output, expected)
|
||||||
|
|
||||||
|
|
||||||
def test_empty_sequence(sampler):
|
def test_empty_sequence(rejection_sampler):
|
||||||
"""Test handling empty sequence of speculated tokens"""
|
"""Test handling empty sequence of speculated tokens"""
|
||||||
spec_tokens: list[list[int]] = [[]]
|
spec_tokens: list[list[int]] = [[]]
|
||||||
output_tokens = [[5]] # Just the bonus token
|
output_tokens = [[5]] # Just the bonus token
|
||||||
@@ -137,13 +181,21 @@ def test_empty_sequence(sampler):
|
|||||||
logits = create_logits_tensor(output_tokens)
|
logits = create_logits_tensor(output_tokens)
|
||||||
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
||||||
device=logits.device)
|
device=logits.device)
|
||||||
|
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||||
|
device=logits.device)
|
||||||
|
|
||||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
output = rejection_sampler(
|
||||||
|
spec_decode_metadata,
|
||||||
|
draft_probs=None,
|
||||||
|
target_logits=logits,
|
||||||
|
bonus_token_ids=bonus_token_tensor,
|
||||||
|
sampling_metadata=metadata,
|
||||||
|
)
|
||||||
expected = torch.tensor([[5]], dtype=torch.int, device=logits.device)
|
expected = torch.tensor([[5]], dtype=torch.int, device=logits.device)
|
||||||
assert torch.equal(output, expected)
|
assert torch.equal(output, expected)
|
||||||
|
|
||||||
|
|
||||||
def test_multiple_mismatches(sampler):
|
def test_multiple_mismatches(rejection_sampler):
|
||||||
"""Test handling multiple sequences with mismatches"""
|
"""Test handling multiple sequences with mismatches"""
|
||||||
spec_tokens = [[1, 2, 3], [4, 5, 6]]
|
spec_tokens = [[1, 2, 3], [4, 5, 6]]
|
||||||
output_tokens = [[1, 2, 7, 6], [4, 8, 6,
|
output_tokens = [[1, 2, 7, 6], [4, 8, 6,
|
||||||
@@ -153,12 +205,22 @@ def test_multiple_mismatches(sampler):
|
|||||||
logits = create_logits_tensor(output_tokens)
|
logits = create_logits_tensor(output_tokens)
|
||||||
bonus_token_tensor = torch.tensor(
|
bonus_token_tensor = torch.tensor(
|
||||||
[output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
|
[output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
|
||||||
|
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||||
|
device=logits.device)
|
||||||
|
|
||||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
output = rejection_sampler(
|
||||||
expected = torch.tensor([[1, 2, 7, INVALID_TOKEN_ID],
|
spec_decode_metadata,
|
||||||
[4, 8, INVALID_TOKEN_ID, INVALID_TOKEN_ID]],
|
draft_probs=None,
|
||||||
dtype=torch.int,
|
target_logits=logits,
|
||||||
device=logits.device)
|
bonus_token_ids=bonus_token_tensor,
|
||||||
|
sampling_metadata=metadata,
|
||||||
|
)
|
||||||
|
expected = torch.tensor(
|
||||||
|
[[1, 2, 7, PLACEHOLDER_TOKEN_ID],
|
||||||
|
[4, 8, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID]],
|
||||||
|
dtype=torch.int,
|
||||||
|
device=logits.device,
|
||||||
|
)
|
||||||
assert torch.equal(output, expected)
|
assert torch.equal(output, expected)
|
||||||
|
|
||||||
|
|
||||||
@@ -166,18 +228,27 @@ def test_multiple_mismatches(sampler):
|
|||||||
"spec_tokens,output_tokens,expected",
|
"spec_tokens,output_tokens,expected",
|
||||||
[
|
[
|
||||||
([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]), # Perfect match with bonus
|
([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]), # Perfect match with bonus
|
||||||
([[1]], [[2, 3]], [[2, INVALID_TOKEN_ID]]), # First mismatch
|
([[1]], [[2, 3]], [[2, PLACEHOLDER_TOKEN_ID]]), # First mismatch
|
||||||
([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]],
|
([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]],
|
||||||
[[1, 5, INVALID_TOKEN_ID], [3, 4, 7]]), # Mixed matches
|
[[1, 5, PLACEHOLDER_TOKEN_ID], [3, 4, 7]]), # Mixed matches
|
||||||
])
|
])
|
||||||
def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected):
|
def test_parametrized_cases(rejection_sampler, spec_tokens, output_tokens,
|
||||||
|
expected):
|
||||||
"""Parametrized test for various matching scenarios"""
|
"""Parametrized test for various matching scenarios"""
|
||||||
metadata = create_sampling_metadata(all_greedy=True)
|
metadata = create_sampling_metadata(all_greedy=True)
|
||||||
logits = create_logits_tensor(output_tokens)
|
logits = create_logits_tensor(output_tokens)
|
||||||
bonus_token_tensor = torch.tensor([tokens[-1] for tokens in output_tokens],
|
bonus_token_tensor = torch.tensor([tokens[-1] for tokens in output_tokens],
|
||||||
device=logits.device)
|
device=logits.device)
|
||||||
|
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||||
|
device=logits.device)
|
||||||
|
|
||||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
output = rejection_sampler(
|
||||||
|
spec_decode_metadata,
|
||||||
|
draft_probs=None,
|
||||||
|
target_logits=logits,
|
||||||
|
bonus_token_ids=bonus_token_tensor,
|
||||||
|
sampling_metadata=metadata,
|
||||||
|
)
|
||||||
expected_tensor = torch.tensor(expected,
|
expected_tensor = torch.tensor(expected,
|
||||||
dtype=torch.int,
|
dtype=torch.int,
|
||||||
device=logits.device)
|
device=logits.device)
|
||||||
@@ -190,21 +261,31 @@ def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected):
|
|||||||
@pytest.mark.parametrize("batch_size", [1, 4, 8])
|
@pytest.mark.parametrize("batch_size", [1, 4, 8])
|
||||||
@pytest.mark.parametrize("frac_seeded", [0.0, 0.5])
|
@pytest.mark.parametrize("frac_seeded", [0.0, 0.5])
|
||||||
@pytest.mark.parametrize("n_rep", [20])
|
@pytest.mark.parametrize("n_rep", [20])
|
||||||
def test_deterministic_when_seeded(sampler, k: int, vocab_size: int,
|
def test_deterministic_when_seeded(
|
||||||
batch_size: int, frac_seeded: float,
|
rejection_sampler,
|
||||||
n_rep: int):
|
k: int,
|
||||||
draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
|
vocab_size: int,
|
||||||
target_probs = torch.rand(batch_size * (k + 1),
|
batch_size: int,
|
||||||
vocab_size,
|
frac_seeded: float,
|
||||||
dtype=torch.float32)
|
n_rep: int,
|
||||||
|
):
|
||||||
|
num_tokens = batch_size * k
|
||||||
|
draft_probs = torch.rand(num_tokens,
|
||||||
|
vocab_size,
|
||||||
|
dtype=torch.float32,
|
||||||
|
device=DEVICE)
|
||||||
|
draft_probs = F.softmax(draft_probs, dim=-1)
|
||||||
|
target_logits = torch.rand_like(draft_probs)
|
||||||
bonus_token_ids = torch.randint(low=0,
|
bonus_token_ids = torch.randint(low=0,
|
||||||
high=vocab_size,
|
high=vocab_size,
|
||||||
size=(batch_size, 1),
|
size=(batch_size, 1),
|
||||||
dtype=torch.int64)
|
dtype=torch.int64,
|
||||||
|
device=DEVICE)
|
||||||
draft_token_ids = torch.randint(low=0,
|
draft_token_ids = torch.randint(low=0,
|
||||||
high=vocab_size,
|
high=vocab_size,
|
||||||
size=(batch_size, k),
|
size=(batch_size, k),
|
||||||
dtype=torch.int64)
|
dtype=torch.int64,
|
||||||
|
device=DEVICE)
|
||||||
|
|
||||||
seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
|
seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
|
||||||
|
|
||||||
@@ -215,10 +296,21 @@ def test_deterministic_when_seeded(sampler, k: int, vocab_size: int,
|
|||||||
for i in range(batch_size) if seeded_mask[i]
|
for i in range(batch_size) if seeded_mask[i]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
temperature = torch.ones(batch_size,
|
||||||
|
dtype=torch.float32,
|
||||||
|
device=DEVICE)
|
||||||
sampling_metadata = create_sampling_metadata(all_greedy=False,
|
sampling_metadata = create_sampling_metadata(all_greedy=False,
|
||||||
|
temperature=temperature,
|
||||||
generators=seeded_seqs)
|
generators=seeded_seqs)
|
||||||
rep_result = sampler(draft_token_ids.tolist(), draft_probs,
|
spec_decode_metadata = SpecDecodeMetadata.make_dummy(
|
||||||
bonus_token_ids, target_probs, sampling_metadata)
|
draft_token_ids.tolist(), device=DEVICE)
|
||||||
|
rep_result = rejection_sampler(
|
||||||
|
spec_decode_metadata,
|
||||||
|
draft_probs=draft_probs,
|
||||||
|
target_logits=target_logits,
|
||||||
|
bonus_token_ids=bonus_token_ids,
|
||||||
|
sampling_metadata=sampling_metadata,
|
||||||
|
)
|
||||||
|
|
||||||
results.append(rep_result)
|
results.append(rep_result)
|
||||||
|
|
||||||
@@ -257,10 +349,10 @@ def test_rejection_sampling_approximates_target_distribution():
|
|||||||
num_reference_probs = 100
|
num_reference_probs = 100
|
||||||
|
|
||||||
# Prepare draft, target, and reference probability distributions
|
# Prepare draft, target, and reference probability distributions
|
||||||
draft_probs, target_probs = (F.softmax(
|
draft_probs = F.softmax(torch.rand(vocab_size, dtype=torch.float32),
|
||||||
torch.rand(vocab_size, dtype=torch.float32),
|
dim=-1)
|
||||||
dim=-1,
|
target_logits = torch.rand(vocab_size, dtype=torch.float32)
|
||||||
) for _ in range(2))
|
target_probs = F.softmax(target_logits, dim=-1)
|
||||||
reference_probs = F.softmax(
|
reference_probs = F.softmax(
|
||||||
torch.rand(num_reference_probs, vocab_size, dtype=torch.float32),
|
torch.rand(num_reference_probs, vocab_size, dtype=torch.float32),
|
||||||
dim=-1,
|
dim=-1,
|
||||||
@@ -273,7 +365,7 @@ def test_rejection_sampling_approximates_target_distribution():
|
|||||||
for num_samples in sample_sizes:
|
for num_samples in sample_sizes:
|
||||||
# Sample using rejection sampling.
|
# Sample using rejection sampling.
|
||||||
rej_sample_probs = estimate_rejection_sampling_pdf(
|
rej_sample_probs = estimate_rejection_sampling_pdf(
|
||||||
draft_probs, target_probs, k, vocab_size, num_samples)
|
draft_probs, target_logits, k, vocab_size, num_samples)
|
||||||
rej_sample_probs = rej_sample_probs.to(DEVICE)
|
rej_sample_probs = rej_sample_probs.to(DEVICE)
|
||||||
|
|
||||||
# Average distance from reference probs.
|
# Average distance from reference probs.
|
||||||
@@ -313,7 +405,7 @@ def get_ratio_first_to_last(elements: list[float]) -> float:
|
|||||||
|
|
||||||
def estimate_rejection_sampling_pdf(
|
def estimate_rejection_sampling_pdf(
|
||||||
draft_probs: torch.Tensor,
|
draft_probs: torch.Tensor,
|
||||||
target_probs: torch.Tensor,
|
target_logits: torch.Tensor,
|
||||||
k: int,
|
k: int,
|
||||||
vocab_size: int,
|
vocab_size: int,
|
||||||
num_samples: int,
|
num_samples: int,
|
||||||
@@ -323,35 +415,44 @@ def estimate_rejection_sampling_pdf(
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
draft_probs: Draft probability distribution.
|
draft_probs: Draft probability distribution.
|
||||||
target_probs: Target probability distribution.
|
target_logits: Target logits.
|
||||||
num_samples: Number of samples to draw.
|
num_samples: Number of samples to draw.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Estimated probability distribution of the output tokens.
|
Estimated probability distribution of the output tokens.
|
||||||
"""
|
"""
|
||||||
sampler = RejectionSampler()
|
rejection_sampler = RejectionSampler()
|
||||||
# Repeat draft probs num_samples times.
|
num_tokens = num_samples * k
|
||||||
|
# Repeat draft probs num_samples * k times.
|
||||||
draft_probs = draft_probs.reshape(1, 1,
|
draft_probs = draft_probs.reshape(1, 1,
|
||||||
vocab_size).repeat(num_samples, k, 1)
|
vocab_size).repeat(num_samples, k, 1)
|
||||||
|
|
||||||
# Repeat target probs num_samples * (k + 1) times.
|
# Repeat target logits num_tokens times.
|
||||||
target_probs = target_probs.reshape(1, 1, vocab_size).repeat(
|
target_logits = target_logits.reshape(1, vocab_size).repeat(num_tokens, 1)
|
||||||
num_samples, k + 1, 1).reshape(num_samples * (k + 1), vocab_size)
|
|
||||||
|
|
||||||
# Randomly sample draft token ids from draft probs.
|
# Randomly sample draft token ids from draft probs.
|
||||||
draft_token_ids = torch.multinomial(draft_probs[:, 0, :],
|
draft_token_ids = torch.multinomial(draft_probs[:, 0, :],
|
||||||
num_samples=k,
|
num_samples=k,
|
||||||
replacement=True).reshape(
|
replacement=True).reshape(
|
||||||
num_samples, k)
|
num_samples, k)
|
||||||
|
draft_probs = draft_probs.view(num_tokens, vocab_size)
|
||||||
|
|
||||||
# Bonus tokens not used but required.
|
# Bonus tokens not used but required.
|
||||||
bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64,
|
bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64,
|
||||||
device=DEVICE).repeat(num_samples, 1)
|
device=DEVICE).repeat(num_samples, 1)
|
||||||
|
|
||||||
sampling_metadata = create_sampling_metadata(all_greedy=False)
|
temperature = torch.ones(num_samples, dtype=torch.float32, device=DEVICE)
|
||||||
output_token_ids = sampler(draft_token_ids.tolist(), draft_probs,
|
sampling_metadata = create_sampling_metadata(all_greedy=False,
|
||||||
bonus_token_ids, target_probs,
|
temperature=temperature)
|
||||||
sampling_metadata)
|
spec_decode_metadata = SpecDecodeMetadata.make_dummy(
|
||||||
|
draft_token_ids.tolist(), device=bonus_token_ids.device)
|
||||||
|
output_token_ids = rejection_sampler(
|
||||||
|
spec_decode_metadata,
|
||||||
|
draft_probs=draft_probs,
|
||||||
|
target_logits=target_logits,
|
||||||
|
bonus_token_ids=bonus_token_ids,
|
||||||
|
sampling_metadata=sampling_metadata,
|
||||||
|
)
|
||||||
output_token_ids = output_token_ids[:, :-1].flatten()
|
output_token_ids = output_token_ids[:, :-1].flatten()
|
||||||
|
|
||||||
hist = torch.histogram(output_token_ids.to(dtype=torch.float,
|
hist = torch.histogram(output_token_ids.to(dtype=torch.float,
|
||||||
|
|||||||
@@ -15,9 +15,10 @@ if TYPE_CHECKING:
|
|||||||
from tests.conftest import VllmRunner
|
from tests.conftest import VllmRunner
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
|
"Qwen/Qwen2.5-1.5B-Instruct",
|
||||||
|
# TODO: Enable these models with v6e
|
||||||
# "Qwen/Qwen2-7B-Instruct",
|
# "Qwen/Qwen2-7B-Instruct",
|
||||||
"meta-llama/Llama-3.1-8B",
|
# "meta-llama/Llama-3.1-8B",
|
||||||
# TODO: Add models here as necessary
|
|
||||||
]
|
]
|
||||||
|
|
||||||
TENSOR_PARALLEL_SIZES = [1]
|
TENSOR_PARALLEL_SIZES = [1]
|
||||||
|
|||||||
@@ -347,7 +347,7 @@ class ModelConfig:
|
|||||||
self.encoder_config = self._get_encoder_config()
|
self.encoder_config = self._get_encoder_config()
|
||||||
self.hf_image_processor_config = get_hf_image_processor_config(
|
self.hf_image_processor_config = get_hf_image_processor_config(
|
||||||
self.model, revision)
|
self.model, revision)
|
||||||
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
|
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
|
||||||
self.use_async_output_proc = use_async_output_proc
|
self.use_async_output_proc = use_async_output_proc
|
||||||
self.mm_processor_kwargs = mm_processor_kwargs
|
self.mm_processor_kwargs = mm_processor_kwargs
|
||||||
self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache
|
self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache
|
||||||
@@ -2526,6 +2526,14 @@ def _get_and_verify_dtype(
|
|||||||
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
|
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
|
||||||
# because config.torch_dtype can be None.
|
# because config.torch_dtype can be None.
|
||||||
config_dtype = getattr(config, "torch_dtype", None)
|
config_dtype = getattr(config, "torch_dtype", None)
|
||||||
|
|
||||||
|
# Fallbacks for multi-modal models if the root config
|
||||||
|
# does not define torch_dtype
|
||||||
|
if config_dtype is None and hasattr(config, "text_config"):
|
||||||
|
config_dtype = getattr(config.text_config, "torch_dtype", None)
|
||||||
|
if config_dtype is None and hasattr(config, "vision_config"):
|
||||||
|
config_dtype = getattr(config.vision_config, "torch_dtype", None)
|
||||||
|
|
||||||
if config_dtype is None:
|
if config_dtype is None:
|
||||||
config_dtype = torch.float32
|
config_dtype = torch.float32
|
||||||
|
|
||||||
@@ -2533,16 +2541,8 @@ def _get_and_verify_dtype(
|
|||||||
dtype = dtype.lower()
|
dtype = dtype.lower()
|
||||||
if dtype == "auto":
|
if dtype == "auto":
|
||||||
if config_dtype == torch.float32:
|
if config_dtype == torch.float32:
|
||||||
if config.model_type in ("gemma2", "gemma3", "gemma3_text"):
|
# Following common practice, we use float16 for float32 models
|
||||||
logger.info(
|
torch_dtype = torch.float16
|
||||||
"For Gemma 2 and 3, we downcast float32 to bfloat16 "
|
|
||||||
"instead of float16 by default. Please specify `dtype` "
|
|
||||||
"if you want to use float16.")
|
|
||||||
torch_dtype = torch.bfloat16
|
|
||||||
else:
|
|
||||||
# Following the common practice, we use float16 for float32
|
|
||||||
# models.
|
|
||||||
torch_dtype = torch.float16
|
|
||||||
else:
|
else:
|
||||||
torch_dtype = config_dtype
|
torch_dtype = config_dtype
|
||||||
|
|
||||||
|
|||||||
@@ -1469,8 +1469,12 @@ class EngineArgs:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
# Need at least Ampere for now (FA support required).
|
# Need at least Ampere for now (FA support required).
|
||||||
|
# Skip this check if we are running on a non-GPU platform,
|
||||||
|
# or if the device capability is not available
|
||||||
|
# (e.g. in a Ray actor without GPUs).
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
if (current_platform.is_cuda()
|
if (current_platform.is_cuda()
|
||||||
|
and current_platform.get_device_capability()
|
||||||
and current_platform.get_device_capability().major < 8):
|
and current_platform.get_device_capability().major < 8):
|
||||||
_raise_or_fallback(feature_name="Compute Capability < 8.0",
|
_raise_or_fallback(feature_name="Compute Capability < 8.0",
|
||||||
recommend_to_remove=False)
|
recommend_to_remove=False)
|
||||||
@@ -1574,6 +1578,13 @@ class EngineArgs:
|
|||||||
_raise_or_fallback(feature_name=name, recommend_to_remove=True)
|
_raise_or_fallback(feature_name=name, recommend_to_remove=True)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# No support for device type other than CUDA, AMD (experimental) or
|
||||||
|
# TPU (experimental) so far.
|
||||||
|
if not (current_platform.is_cuda_alike() or current_platform.is_tpu()):
|
||||||
|
_raise_or_fallback(
|
||||||
|
feature_name=f"device type={current_platform.device_type}",
|
||||||
|
recommend_to_remove=False)
|
||||||
|
return False
|
||||||
#############################################################
|
#############################################################
|
||||||
# Experimental Features - allow users to opt in.
|
# Experimental Features - allow users to opt in.
|
||||||
|
|
||||||
|
|||||||
@@ -548,7 +548,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
|||||||
if top_logprobs < 0:
|
if top_logprobs < 0:
|
||||||
raise ValueError("`top_logprobs` must be a positive value.")
|
raise ValueError("`top_logprobs` must be a positive value.")
|
||||||
|
|
||||||
if not data.get("logprobs"):
|
if top_logprobs > 0 and not data.get("logprobs"):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"when using `top_logprobs`, `logprobs` must be set to true."
|
"when using `top_logprobs`, `logprobs` must be set to true."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -35,7 +35,6 @@ if TYPE_CHECKING:
|
|||||||
VLLM_TRACE_FUNCTION: int = 0
|
VLLM_TRACE_FUNCTION: int = 0
|
||||||
VLLM_ATTENTION_BACKEND: Optional[str] = None
|
VLLM_ATTENTION_BACKEND: Optional[str] = None
|
||||||
VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
|
VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
|
||||||
VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False
|
|
||||||
VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
|
VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
|
||||||
VLLM_PP_LAYER_PARTITION: Optional[str] = None
|
VLLM_PP_LAYER_PARTITION: Optional[str] = None
|
||||||
VLLM_CPU_KVCACHE_SPACE: int = 0
|
VLLM_CPU_KVCACHE_SPACE: int = 0
|
||||||
|
|||||||
@@ -16,12 +16,8 @@ import torch
|
|||||||
|
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.triton_utils.importing import HAS_TRITON
|
|
||||||
from vllm.utils import _check_multiproc_method, get_mp_context, run_method
|
from vllm.utils import _check_multiproc_method, get_mp_context, run_method
|
||||||
|
|
||||||
if HAS_TRITON:
|
|
||||||
from vllm.triton_utils import maybe_set_triton_cache_manager
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
T = TypeVar('T')
|
T = TypeVar('T')
|
||||||
@@ -314,7 +310,3 @@ def set_multiprocessing_worker_envs(parallel_config):
|
|||||||
current_parallelism, default_omp_num_threads)
|
current_parallelism, default_omp_num_threads)
|
||||||
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
|
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
|
||||||
torch.set_num_threads(default_omp_num_threads)
|
torch.set_num_threads(default_omp_num_threads)
|
||||||
|
|
||||||
# workaround for https://github.com/vllm-project/vllm/issues/6103
|
|
||||||
if HAS_TRITON and parallel_config.world_size > 1:
|
|
||||||
maybe_set_triton_cache_manager()
|
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor,
|
|||||||
is_regex_target_modules,
|
is_regex_target_modules,
|
||||||
parse_fine_tuned_lora_name, replace_submodule)
|
parse_fine_tuned_lora_name, replace_submodule)
|
||||||
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
|
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
|
||||||
|
from vllm.model_executor.models.interfaces import is_pooling_model
|
||||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||||
from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
|
from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
|
||||||
from vllm.utils import is_pin_memory_available
|
from vllm.utils import is_pin_memory_available
|
||||||
@@ -104,6 +105,9 @@ class LoRAModel(AdapterModel):
|
|||||||
"""Get LoRA for a given module by name"""
|
"""Get LoRA for a given module by name"""
|
||||||
return self.loras.get(module_name, None)
|
return self.loras.get(module_name, None)
|
||||||
|
|
||||||
|
def check_lora_name(self, lora_name: str) -> bool:
|
||||||
|
return lora_name in self.loras
|
||||||
|
|
||||||
# (yard1): TODO see if we can derive target_embedding_padding automatically
|
# (yard1): TODO see if we can derive target_embedding_padding automatically
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_lora_tensors(
|
def from_lora_tensors(
|
||||||
@@ -335,6 +339,7 @@ class LoRAModelManager(AdapterModelManager):
|
|||||||
# Used for long context lora.
|
# Used for long context lora.
|
||||||
self.scaling_factor_to_offset: Dict[float, int] = {}
|
self.scaling_factor_to_offset: Dict[float, int] = {}
|
||||||
super().__init__(model)
|
super().__init__(model)
|
||||||
|
|
||||||
self.supported_lora_modules = get_supported_lora_modules(self.model)
|
self.supported_lora_modules = get_supported_lora_modules(self.model)
|
||||||
assert self.supported_lora_modules, "No supported LoRA modules found in"
|
assert self.supported_lora_modules, "No supported LoRA modules found in"
|
||||||
f"{self.model.__class__.__name__}."
|
f"{self.model.__class__.__name__}."
|
||||||
@@ -350,6 +355,7 @@ class LoRAModelManager(AdapterModelManager):
|
|||||||
# In case the model only supports LoRA for
|
# In case the model only supports LoRA for
|
||||||
# text modules (e.g. ChatGLM)
|
# text modules (e.g. ChatGLM)
|
||||||
and hasattr(self.model, "get_mm_mapping"))
|
and hasattr(self.model, "get_mm_mapping"))
|
||||||
|
self.is_pooling_model = is_pooling_model(self.model)
|
||||||
self.packed_modules: Dict[str, List[str]] = {}
|
self.packed_modules: Dict[str, List[str]] = {}
|
||||||
self.modules: Dict[str, BaseLayerWithLoRA] = {}
|
self.modules: Dict[str, BaseLayerWithLoRA] = {}
|
||||||
# Dict instead of a Set for compatibility with LRUCache.
|
# Dict instead of a Set for compatibility with LRUCache.
|
||||||
@@ -389,7 +395,7 @@ class LoRAModelManager(AdapterModelManager):
|
|||||||
lora_model.id, index)
|
lora_model.id, index)
|
||||||
self.lora_index_to_id[index] = lora_model.id
|
self.lora_index_to_id[index] = lora_model.id
|
||||||
for module_name, module in self.modules.items():
|
for module_name, module in self.modules.items():
|
||||||
module_lora = lora_model.get_lora(module_name)
|
module_lora = self._get_lora_layer_weights(lora_model, module_name)
|
||||||
if module_lora:
|
if module_lora:
|
||||||
module_lora.optimize()
|
module_lora.optimize()
|
||||||
# Bias is not explicitly enabled with the flag enable_lora_bias.
|
# Bias is not explicitly enabled with the flag enable_lora_bias.
|
||||||
@@ -626,7 +632,7 @@ class LoRAModelManager(AdapterModelManager):
|
|||||||
replaced_module: Set[str] = set()
|
replaced_module: Set[str] = set()
|
||||||
has_replacement = False
|
has_replacement = False
|
||||||
for r in new_module_names:
|
for r in new_module_names:
|
||||||
lora = lora_model.get_lora(r)
|
lora = self._get_lora_layer_weights(lora_model, r)
|
||||||
replacement_loras.append(lora)
|
replacement_loras.append(lora)
|
||||||
if lora:
|
if lora:
|
||||||
has_replacement = True
|
has_replacement = True
|
||||||
@@ -637,12 +643,34 @@ class LoRAModelManager(AdapterModelManager):
|
|||||||
if replacement_loras[i]:
|
if replacement_loras[i]:
|
||||||
continue
|
continue
|
||||||
replacement_loras[i] = None
|
replacement_loras[i] = None
|
||||||
|
# HACK Temporary solution for the pool model.
|
||||||
|
if self.is_pooling_model and not lora_model.check_lora_name(
|
||||||
|
module_name):
|
||||||
|
replaced_module_name = module_name.replace("model.", "")
|
||||||
|
if lora_model.check_lora_name(module_name):
|
||||||
|
module_name = replaced_module_name
|
||||||
lora_model.loras[module_name] = PackedLoRALayerWeights.pack(
|
lora_model.loras[module_name] = PackedLoRALayerWeights.pack(
|
||||||
replacement_loras)
|
replacement_loras)
|
||||||
# Remove the modules that have been replaced.
|
# Remove the modules that have been replaced.
|
||||||
for module in replaced_module:
|
for module in replaced_module:
|
||||||
lora_model.loras.pop(module, None)
|
lora_model.loras.pop(module, None)
|
||||||
|
|
||||||
|
def _get_lora_layer_weights(
|
||||||
|
self, lora_model: LoRAModel,
|
||||||
|
module_name: str) -> Optional[LoRALayerWeights]:
|
||||||
|
org_module_name = module_name
|
||||||
|
if self.is_pooling_model and not lora_model.check_lora_name(
|
||||||
|
module_name):
|
||||||
|
# If it's a pool model, and the layer name is not found,
|
||||||
|
# remove 'model.' from the name (every occurrence) and search again.
|
||||||
|
module_name = module_name.replace("model.", "")
|
||||||
|
if lora_model.check_lora_name(module_name):
|
||||||
|
org_module_name = module_name
|
||||||
|
logger.info_once(
|
||||||
|
"For the pool model, successfully loaded the LoRA weights "
|
||||||
|
"after removing the prefix 'model.'.")
|
||||||
|
return lora_model.get_lora(org_module_name)
|
||||||
|
|
||||||
def deactivate_adapter(self, adapter_id: int) -> bool:
|
def deactivate_adapter(self, adapter_id: int) -> bool:
|
||||||
return deactivate_adapter(adapter_id, self._active_adapters,
|
return deactivate_adapter(adapter_id, self._active_adapters,
|
||||||
self._deactivate_adapter)
|
self._deactivate_adapter)
|
||||||
|
|||||||
@@ -0,0 +1,200 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 256,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 256,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 256,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 1,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 1,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,200 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 256,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 256,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 256,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 1,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 1,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,200 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 256,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 4,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 4,
|
||||||
|
"num_warps": 1,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,200 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 256,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 4,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 4,
|
||||||
|
"num_warps": 1,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,200 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 16,
|
||||||
|
"BLOCK_SIZE_K": 256,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 1
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 4,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 4,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 4,
|
||||||
|
"num_warps": 1,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_M": 16,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"num_warps": 2,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_M": 128,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"BLOCK_SIZE_K": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"num_warps": 8,
|
||||||
|
"num_stages": 2,
|
||||||
|
"waves_per_eu": 0,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"kpack": 2
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -783,8 +783,12 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
|
|||||||
use_int8_w8a16=use_int8_w8a16,
|
use_int8_w8a16=use_int8_w8a16,
|
||||||
**config,
|
**config,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
config = config.copy()
|
||||||
|
BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K")
|
||||||
|
if block_shape is not None:
|
||||||
|
BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0],
|
||||||
|
block_shape[1]))
|
||||||
fused_moe_kernel[grid](
|
fused_moe_kernel[grid](
|
||||||
A,
|
A,
|
||||||
B,
|
B,
|
||||||
@@ -823,6 +827,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
|
|||||||
compute_type=compute_type,
|
compute_type=compute_type,
|
||||||
use_fp8_w8a8=use_fp8_w8a8,
|
use_fp8_w8a8=use_fp8_w8a8,
|
||||||
use_int8_w8a16=use_int8_w8a16,
|
use_int8_w8a16=use_int8_w8a16,
|
||||||
|
BLOCK_SIZE_K=BLOCK_SIZE_K,
|
||||||
**config,
|
**config,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"1": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"8": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"16": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"24": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"32": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"48": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"64": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 32,
|
||||||
|
"GROUP_SIZE_M": 8,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"96": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"128": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"256": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 64,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"512": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 1,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1024": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"1536": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 32,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 16,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"2048": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"3072": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
},
|
||||||
|
"4096": {
|
||||||
|
"BLOCK_SIZE_K": 128,
|
||||||
|
"BLOCK_SIZE_M": 64,
|
||||||
|
"BLOCK_SIZE_N": 128,
|
||||||
|
"GROUP_SIZE_M": 32,
|
||||||
|
"kpack": 1,
|
||||||
|
"matrix_instr_nonkdim": 16,
|
||||||
|
"num_warps": 4
|
||||||
|
}
|
||||||
|
}
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user