[Bugfix] VLLM_V1 supports passing other compilation levels (#19340)
Signed-off-by: Richard Zou <zou3519@gmail.com>
This commit is contained in:
@@ -26,6 +26,8 @@ def test_use_cudagraphs_dynamic(monkeypatch):
|
||||
assert not vllm_config.compilation_config.use_cudagraph
|
||||
|
||||
|
||||
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
|
||||
@pytest.mark.forked
|
||||
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
|
||||
# on the state of the cache directory on the current machine, which
|
||||
# may be influenced by other tests.
|
||||
@@ -33,8 +35,8 @@ def test_use_cudagraphs_dynamic(monkeypatch):
|
||||
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
|
||||
assert vllm.envs.VLLM_USE_V1
|
||||
|
||||
# spawn means that the counters are in the same process.
|
||||
monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn")
|
||||
# Disable multiprocessing so that the counter is in the same process
|
||||
monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
|
||||
monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val)
|
||||
|
||||
compilation_config = {
|
||||
@@ -50,6 +52,8 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
|
||||
pass
|
||||
|
||||
|
||||
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
|
||||
@pytest.mark.forked
|
||||
@pytest.mark.parametrize("enabled", [True, False])
|
||||
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
||||
assert vllm.envs.VLLM_USE_V1
|
||||
@@ -72,3 +76,50 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
||||
compilation_config=compilation_config,
|
||||
gpu_memory_utilization=0.4) as _):
|
||||
pass
|
||||
|
||||
|
||||
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_dynamo_as_is(vllm_runner, monkeypatch):
    """Build a runner with compilation level 1 and check the counter.

    The test passes when ``compilation_counter.expect`` accepts
    ``dynamo_as_is_count=1`` for the span during which the runner for
    ``facebook/opt-125m`` is constructed and torn down.
    """
    # Keep everything in a single process; otherwise the counter would be
    # updated in a different process from the one asserting on it.
    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

    expected_counts = compilation_counter.expect(dynamo_as_is_count=1)
    with expected_counts:
        # Model loading is what triggers compilation (when enabled), so
        # merely entering the runner context is enough for this check.
        with vllm_runner('facebook/opt-125m',
                         compilation_config={"level": 1},
                         gpu_memory_utilization=0.4):
            pass
|
||||
|
||||
|
||||
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_no_compilation(vllm_runner, monkeypatch):
    """Build a runner with compilation level 0 and check the counter.

    The test passes when ``compilation_counter.expect`` accepts
    ``num_graphs_seen=0`` and ``dynamo_as_is_count=0`` for the span during
    which the runner for ``facebook/opt-125m`` is constructed and torn down.
    """
    # Keep everything in a single process; otherwise the counter would be
    # updated in a different process from the one asserting on it.
    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

    expected_counts = compilation_counter.expect(num_graphs_seen=0,
                                                 dynamo_as_is_count=0)
    with expected_counts:
        # Model loading is what triggers compilation (when enabled), so
        # merely entering the runner context is enough for this check.
        with vllm_runner('facebook/opt-125m',
                         compilation_config={"level": 0},
                         gpu_memory_utilization=0.4):
            pass
|
||||
|
||||
|
||||
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_enforce_eager(vllm_runner, monkeypatch):
    """Build a runner with ``enforce_eager=True`` and check the counter.

    The test passes when ``compilation_counter.expect`` accepts
    ``num_graphs_seen=0`` and ``dynamo_as_is_count=0`` for the span during
    which the runner for ``facebook/opt-125m`` is constructed and torn down.
    """
    # Keep everything in a single process; otherwise the counter would be
    # updated in a different process from the one asserting on it.
    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

    expected_counts = compilation_counter.expect(num_graphs_seen=0,
                                                 dynamo_as_is_count=0)
    with expected_counts:
        # Model loading is what triggers compilation (when enabled), so
        # merely entering the runner context is enough for this check.
        with vllm_runner('facebook/opt-125m',
                         enforce_eager=True,
                         gpu_memory_utilization=0.4):
            pass
|
||||
|
||||
Reference in New Issue
Block a user