bring in the TTL

This commit is contained in:
2026-04-17 01:45:19 +00:00
parent d8f5f88b64
commit b180c4c57f
5 changed files with 623 additions and 1089 deletions

View File

@@ -1,4 +1,5 @@
FROM vllm/vllm-openai:v0.19.0-cu130
#FROM vllm/vllm-openai:v0.19.0-cu130
FROM vllm/vllm-openai:cu130-nightly-x86_64
# Install LMCache for KV cache offloading / sharing across nodes
# Build with system CUDA 13.0 for Blackwell (B200)
@@ -9,21 +10,16 @@ RUN apt-get update && apt-get install -y git \
libcurand-dev-13-0 \
libcufft-dev-13-0 \
libnvjitlink-dev-13-0 && \
git clone https://github.com/neuralwatt/LMCache.git /tmp/lmcache && \
git clone https://github.com/biondizzle/LMCache.git /tmp/lmcache && \
cd /tmp/lmcache && \
git checkout fix/mla-multi-group-kv-cache && \
git checkout feat/redis-ttl && \
CUDA_HOME=/usr/local/cuda \
TORCH_CUDA_ARCH_LIST="10.0" \
pip install --no-cache-dir --no-build-isolation . && \
rm -rf /tmp/lmcache
# Copy over the Nemotron reasoning parser
COPY ./super_v3_reasoning_parser.py /opt/super_v3_reasoning_parser.py
# Monkey-patch vllm/config/vllm.py so the kv_events_config branch only runs when enable_kv_cache_events is set - https://github.com/vllm-project/vllm/pull/38237/changes#diff-bee6813076031d3ca1edc903c1b02b81e4676519afc562ce3fefe37f20c7b650
RUN sed -i "s/if self\.kv_events_config is not None:/if self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events:/" /usr/local/lib/python3.12/dist-packages/vllm/config/vllm.py
# Patch LMCacheConnectorV1 to support HMA (Hybrid Mamba/Attention KV cache manager)
# This is required for hybrid models like Nemotron that use both Mamba and Attention layers.
# Without this patch, LMCacheConnectorV1 fails with:
# "Connector LMCacheConnectorV1 does not support HMA but HMA is enabled"
COPY ./lmcache_connector.py /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
# Copy over deepseek tool call parser with MTP fixes
COPY deepseekv32_tool_parser.py /usr/local/lib/python3.12/dist-packages/vllm/tool_parsers/deepseekv32_tool_parser.py

View File

@@ -1,337 +0,0 @@
apiVersion: v1
kind: Pod
metadata:
annotations:
cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
cni.projectcalico.org/podIP: 10.244.248.111/32
cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
k8s.v1.cni.cncf.io/network-status: |-
[{
"name": "k8s-pod-network",
"ips": [
"10.244.248.111",
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
],
"default": true,
"dns": {}
},{
"name": "vllm/ipoib-network-vllm",
"interface": "net1",
"ips": [
"10.66.0.6"
],
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
"dns": {}
}]
k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
k8s.v1.cni.cncf.io/networks-status: |-
[{
"name": "k8s-pod-network",
"ips": [
"10.244.248.111",
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
],
"default": true,
"dns": {}
},{
"name": "vllm/ipoib-network-vllm",
"interface": "net1",
"ips": [
"10.66.0.6"
],
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
"dns": {}
}]
creationTimestamp: '2026-04-15T22:38:27Z'
generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
generation: 1
labels:
app.kubernetes.io/component: serving-engine
app.kubernetes.io/instance: production-stack-sea-inference
app.kubernetes.io/managed-by: helm
app.kubernetes.io/name: nemotron-3-super
app.kubernetes.io/part-of: vllm-stack
environment: test
helm-release-name: production-stack-sea-inference
model: nemotron-3-super
pod-template-hash: 856dc7d695
release: test
topology.kubernetes.io/region: sea
name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
namespace: vllm
ownerReferences:
- apiVersion: apps/v1
blockOwnerDeletion: true
controller: true
kind: ReplicaSet
name: >-
production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
uid: 88c04723-f29b-432a-8318-21a9d389cac4
resourceVersion: '29767269'
uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
spec:
containers:
- command:
- vllm
- serve
- nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
- '--host'
- 0.0.0.0
- '--port'
- '8000'
- '--no-enable-prefix-caching'
- '--tensor-parallel-size'
- '8'
- '--async-scheduling'
- '--dtype=auto'
- '--attention-backend=TRITON_ATTN'
- '--gpu_memory_utilization=0.96'
- '--enable-auto-tool-choice'
- '--tool-call-parser=qwen3_coder'
- '--trust_remote_code'
- '--max-cudagraph-capture-size=128'
- '--enable-chunked-prefill'
- '--mamba-ssm-cache-dtype=float16'
- '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
- '--reasoning-parser=super_v3'
- '--max-model-len=1048576'
- '--disable-custom-all-reduce'
- '--no-disable-hybrid-kv-cache-manager'
- '--enforce-eager'
- '--kv-transfer-config'
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: '123'
- name: HF_HOME
value: /tmp
- name: POD_IP
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.podIP
- name: PROMETHEUS_MULTIPROC_DIR
value: /tmp
- name: OMP_NUM_THREADS
value: '32'
- name: HF_TOKEN
value: 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
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
value: '1'
- name: NCCL_TOPO_FILE
value: /etc/nccl/virtualTopology.xml
- name: PYTORCH_CUDA_ALLOC_CONF
value: expandable_segments:True
- name: LMCACHE_REMOTE_URL
value: redis://10.66.0.100:6379
- name: LMCACHE_REMOTE_SERDE
value: naive
- name: LMCACHE_USE_EXPERIMENTAL
value: 'True'
- name: VLLM_RPC_TIMEOUT
value: '1000000'
- name: LMCACHE_LOG_LEVEL
value: ERROR
- name: LMCACHE_LOCAL_CPU
value: 'True'
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
value: '512'
- name: LMCACHE_LMCACHE_INSTANCE_ID
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.name
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
imagePullPolicy: Always
livenessProbe:
failureThreshold: 3
httpGet:
path: /health
port: 8000
scheme: HTTP
initialDelaySeconds: 15
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
name: vllm
ports:
- containerPort: 8000
name: container-port
protocol: TCP
- containerPort: 55555
name: zmq-port
protocol: TCP
- containerPort: 9999
name: ucx-port
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /health
port: 8000
scheme: HTTP
initialDelaySeconds: 15
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
resources:
limits:
memory: 1500Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
requests:
cpu: '8'
memory: 16Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
securityContext:
runAsNonRoot: false
startupProbe:
failureThreshold: 120
httpGet:
path: /health
port: 8000
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 60
successThreshold: 1
timeoutSeconds: 1
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /dev/shm
name: shm
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-dlhrd
readOnly: true
dnsPolicy: ClusterFirst
enableServiceLinks: true
hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
nodeName: b200-nodepool-d51376abbf32
preemptionPolicy: PreemptLowerPriority
priority: 0
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
serviceAccount: default
serviceAccountName: default
subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- emptyDir:
medium: Memory
sizeLimit: 64Gi
name: shm
- name: kube-api-access-dlhrd
projected:
defaultMode: 420
sources:
- serviceAccountToken:
expirationSeconds: 3607
path: token
- configMap:
items:
- key: ca.crt
path: ca.crt
name: kube-root-ca.crt
- downwardAPI:
items:
- fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
path: namespace
status:
conditions:
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:32Z'
observedGeneration: 1
status: 'True'
type: PodReadyToStartContainers
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:27Z'
observedGeneration: 1
status: 'True'
type: Initialized
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:27Z'
message: 'containers with unready status: [vllm]'
observedGeneration: 1
reason: ContainersNotReady
status: 'False'
type: Ready
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:27Z'
message: 'containers with unready status: [vllm]'
observedGeneration: 1
reason: ContainersNotReady
status: 'False'
type: ContainersReady
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:27Z'
observedGeneration: 1
status: 'True'
type: PodScheduled
containerStatuses:
- allocatedResources:
cpu: '8'
memory: 16Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
containerID: >-
containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
imageID: >-
atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
lastState:
terminated:
containerID: >-
containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
exitCode: 1
finishedAt: '2026-04-15T22:42:20Z'
reason: Error
startedAt: '2026-04-15T22:38:31Z'
name: vllm
ready: false
resources:
limits:
memory: 1500Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
requests:
cpu: '8'
memory: 16Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
restartCount: 1
started: false
state:
running:
startedAt: '2026-04-15T22:42:24Z'
volumeMounts:
- mountPath: /dev/shm
name: shm
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-dlhrd
readOnly: true
recursiveReadOnly: Disabled
hostIP: 10.4.96.13
hostIPs:
- ip: 10.4.96.13
- ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
observedGeneration: 1
phase: Running
podIP: 10.244.248.111
podIPs:
- ip: 10.244.248.111
- ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
qosClass: Burstable
startTime: '2026-04-15T22:38:27Z'

View File

@@ -1,362 +0,0 @@
(Worker_TP0 pid=347) INFO 04-15 23:04:43 [default_loader.py:384] Loading weights took 22.09 seconds
(Worker_TP0 pid=347) INFO 04-15 23:04:43 [flashinfer_utils.py:238] Padding intermediate size from 336 to 384 for up/down projection weights.
(Worker_TP0 pid=347) INFO 04-15 23:04:43 [nvfp4.py:401] Using MoEPrepareAndFinalizeNoDPEPMonolithic
(Worker_TP0 pid=347) WARNING 04-15 23:04:44 [kv_cache.py:94] Checkpoint does not provide a q scaling factor. Setting it to k_scale. This only matters for FP8 Attention backends (flash-attn or flashinfer).
(Worker_TP0 pid=347) WARNING 04-15 23:04:44 [kv_cache.py:108] Using KV cache scaling factor 1.0 for fp8_e4m3. If this is unintended, verify that k/v_scale scaling factors are properly set in the checkpoint.
(Worker_TP0 pid=347) INFO 04-15 23:04:46 [gpu_model_runner.py:4820] Model loading took 10.4 GiB memory and 133.856349 seconds
(Worker_TP0 pid=347) INFO 04-15 23:04:53 [backends.py:1051] Using cache directory: /root/.cache/vllm/torch_compile_cache/3fd416396e/rank_0_0/backbone for vLLM's torch.compile
(Worker_TP0 pid=347) INFO 04-15 23:04:53 [backends.py:1111] Dynamo bytecode transform time: 4.26 s
(Worker_TP0 pid=347) INFO 04-15 23:04:53 [flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm
(Worker_TP0 pid=347) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
(Worker_TP0 pid=347) return func(*args, **kwargs)
(Worker_TP0 pid=347) INFO 04-15 23:04:54 [flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm
(Worker_TP0 pid=347) INFO 04-15 23:04:57 [backends.py:372] Cache the graph of compile range (1, 128) for later use
(Worker_TP0 pid=347) INFO 04-15 23:04:57 [backends.py:372] Cache the graph of compile range (129, 8192) for later use
(Worker_TP0 pid=347) INFO 04-15 23:05:10 [backends.py:390] Compiling a graph for compile range (1, 128) takes 13.86 s
(Worker_TP0 pid=347) INFO 04-15 23:05:11 [backends.py:390] Compiling a graph for compile range (129, 8192) takes 14.38 s
(Worker_TP0 pid=347) INFO 04-15 23:05:13 [decorators.py:640] saved AOT compiled function to /root/.cache/vllm/torch_compile_cache/torch_aot_compile/843944412cd4c5c9ac31fd76eb61f7a06b6ada8d50eaed83ce0c0803840a330f/rank_0_0/model
(Worker_TP0 pid=347) INFO 04-15 23:05:13 [monitor.py:48] torch.compile took 24.48 s in total
(Worker_TP0 pid=347) INFO 04-15 23:05:20 [monitor.py:76] Initial profiling/warmup run took 7.55 s
(Worker_TP3 pid=555) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
(Worker_TP6 pid=774) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
(Worker_TP5 pid=701) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
(Worker_TP0 pid=347) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] EngineCore failed to start.
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] Traceback (most recent call last):
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] return func(*args, **kwargs)
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] super().__init__(
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 124, in __init__
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] kv_cache_config = self._initialize_kv_caches(vllm_config)
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] return func(*args, **kwargs)
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 247, in _initialize_kv_caches
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] available_gpu_memory = self.model_executor.determine_available_memory()
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 136, in determine_available_memory
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] return self.collective_rpc("determine_available_memory")
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 397, in collective_rpc
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] return aggregate(get_response())
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 380, in get_response
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] raise RuntimeError(
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] RuntimeError: Worker failed with error 'Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.', please check the stack trace above for the root cause
(Worker_TP6 pid=774) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP5 pid=701) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP0 pid=347) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP3 pid=555) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP1 pid=414) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP4 pid=628) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP2 pid=482) Exception ignored in: <function ExactWeakKeyDictionary.__setitem__.<locals>.<lambda> at 0x7f04b2c57a60>
(Worker_TP2 pid=482) Traceback (most recent call last):
(Worker_TP2 pid=482) File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/utils.py", line 1025, in <lambda>
(Worker_TP2 pid=482) self.refs[idx] = weakref.ref(key, lambda ref: self._remove_id(idx))
(Worker_TP2 pid=482)
(Worker_TP2 pid=482) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 797, in signal_handler
(Worker_TP2 pid=482) raise SystemExit()
(Worker_TP2 pid=482) SystemExit:
(Worker_TP2 pid=482) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
(Worker_TP7 pid=847) Exception ignored in: <function ExactWeakKeyDictionary.__setitem__.<locals>.<lambda> at 0x7f4be1d7df80>
(Worker_TP7 pid=847) Traceback (most recent call last):
(Worker_TP7 pid=847) File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/utils.py", line 1025, in <lambda>
(Worker_TP7 pid=847) self.refs[idx] = weakref.ref(key, lambda ref: self._remove_id(idx))
(Worker_TP7 pid=847)
(Worker_TP7 pid=847) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 797, in signal_handler
(Worker_TP7 pid=847) raise SystemExit()
(Worker_TP7 pid=847) SystemExit:
(Worker_TP7 pid=847) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.

616
deepseekv32_tool_parser.py Normal file
View File

@@ -0,0 +1,616 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
DeepSeek-V3.2 Tool Call Parser — re-parse-and-diff version.
Adapted from the GLM-4 streaming fix to make the streaming path robust
against multi-token deltas produced by MTP speculative decoding.
Instead of maintaining incremental state that advances one token at a
time, the streaming path re-parses the *entire* current_text on every
call, finds all <DSMLinvoke> regions (complete and in-progress),
builds a JSON arguments string for each, and diffs against what was
previously sent. This makes the parser agnostic to how many tokens
arrive per step.
Key changes vs. the upstream buffer-until-complete parser:
1. _extract_content() handles partial tag overlaps so content text
is never swallowed or duplicated when a tag boundary lands inside
a multi-token chunk.
2. _extract_invoke_regions() finds both complete and incomplete
invoke blocks, enabling streaming of partial arguments.
3. _build_args_json_so_far() constructs the JSON arguments string
incrementally from complete + partial <DSMLparameter> tags.
4. _compute_args_diff() emits only the newly-added characters.
Drop-in replacement: same class name, same interface.
"""
import json
import uuid
from collections.abc import Sequence
from typing import Any
import regex as re
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
DeltaFunctionCall,
DeltaMessage,
DeltaToolCall,
ExtractedToolCallInformation,
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
Tool,
ToolParser,
)
logger = init_logger(__name__)
def partial_tag_overlap(text: str, tag: str) -> int:
    """Return how many leading characters of *tag* dangle at the end of *text*.

    The result is the size of the longest proper prefix of *tag* that is
    also a suffix of *text*.  For instance, with ``tag="<tool_call>"`` a
    text ending in ``"<tool_"`` yields 6.  A match covering the whole tag
    is never reported, because at most ``len(tag) - 1`` characters are
    considered.

    Args:
        text: The text whose tail is inspected.
        tag: The tag that may be partially present at the end of *text*.

    Returns:
        The overlap length, or 0 when no prefix of *tag* ends *text*.
    """
    # Never look at more characters than the text has, nor at the full tag.
    longest_candidate = min(len(text), len(tag) - 1)
    # Walk from the longest candidate down so the first hit is the maximum.
    hits = (
        size
        for size in range(longest_candidate, 0, -1)
        if text.endswith(tag[:size])
    )
    return next(hits, 0)
class DeepSeekV32ToolParser(ToolParser):
"""
Re-parse-and-diff tool parser for DeepSeek-V3.2 DSML format.
On every streaming call the parser re-parses ``current_text`` to
find ``<DSMLinvoke>`` regions, builds the JSON arguments string
for each tool call, and diffs against what was previously sent to
emit only new content. This is robust against multi-token deltas
from MTP / EAGLE speculative decoding.
Example tool call format::
<DSMLfunction_calls>
<DSMLinvoke name="get_weather">
<DSMLparameter name="location" string="true">杭州</DSMLparameter>
<DSMLparameter name="date" string="true">2024-01-16</DSMLparameter>
</DSMLinvoke>
</DSMLfunction_calls>
"""
def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
    """Set up tag literals, compiled patterns and per-request streaming state.

    Args:
        tokenizer: Tokenizer forwarded to the ``ToolParser`` base class.
        tools: Optional tool definitions forwarded to the base class.

    Raises:
        ValueError: If ``self.model_tokenizer`` is falsy once all other
            attributes have been initialised.
    """
    super().__init__(tokenizer, tools)

    # Every pattern below must let "." span newlines.
    dotall = re.DOTALL

    # --- Literal DSML tags ---
    # NOTE(review): these literals have to match what the model emits
    # exactly; confirm them against the tokenizer / chat template.
    outer_open_tag = "<DSMLfunction_calls>"
    self.tool_call_start_token: str = outer_open_tag
    self.tool_call_end_token: str = "</DSMLfunction_calls>"
    self.invoke_end_token: str = "</DSMLinvoke>"
    self.param_end_token: str = "</DSMLparameter>"
    # Second name for the start tag (the original comment says the
    # ToolParser base / adjust_request expects this alias).
    self.tool_calls_start_token = self.tool_call_start_token

    # --- Pre-compiled patterns ---
    # Whole <DSMLfunction_calls> ... </DSMLfunction_calls> section;
    # group 1 is everything between the two tags.
    self.tool_call_complete_regex = re.compile(
        r"<DSMLfunction_calls>(.*?)</DSMLfunction_calls>", dotall
    )
    # Invoke opening tag only; group 1 is the function name.
    self.invoke_start_regex = re.compile(
        r'<DSMLinvoke\s+name="([^"]+)"\s*>', dotall
    )
    # Closed invoke block; groups are (function name, inner body).
    self.invoke_complete_regex = re.compile(
        r'<DSMLinvoke\s+name="([^"]+)"\s*>(.*?)</DSMLinvoke>',
        dotall,
    )
    # Closed parameter tag; groups are (name, string flag, raw value).
    self.parameter_complete_regex = re.compile(
        r'<DSMLparameter\s+name="([^"]+)"\s+string="(true|false)"\s*>'
        r"(.*?)"
        r"</DSMLparameter>",
        dotall,
    )
    # Parameter opening tag only, used when the value is still arriving.
    self.parameter_header_regex = re.compile(
        r'<DSMLparameter\s+name="([^"]+)"\s+string="(true|false)"\s*>',
        dotall,
    )

    # --- Streaming bookkeeping (starts empty for each parser instance) ---
    self._sent_content_idx: int = 0
    self._tool_call_ids: list[str] = []
    self.streamed_args_for_tool: list[str] = []
    self.prev_tool_call_arr: list[dict[str, Any]] = []
    self.current_tool_id: int = -1

    # Refuse to operate when no tokenizer is available.
    if not self.model_tokenizer:
        raise ValueError(
            "The model tokenizer must be passed to the ToolParser "
            "constructor during construction."
        )

    logger.debug("Successfully initialized %s", self.__class__.__name__)
# ------------------------------------------------------------------
# Request adjustment
# ------------------------------------------------------------------
def adjust_request(
    self, request: ChatCompletionRequest | ResponsesRequest
) -> ChatCompletionRequest | ResponsesRequest:
    """Apply the base-class adjustments, then keep special tokens if tools are on.

    Args:
        request: The incoming chat-completion or responses request.

    Returns:
        The request returned by the base class, with
        ``skip_special_tokens`` set to ``False`` when the request carries
        tools and its ``tool_choice`` is not ``"none"``.
    """
    adjusted = super().adjust_request(request)
    tools_requested = bool(adjusted.tools) and adjusted.tool_choice != "none"
    if tools_requested:
        # The DSML markers must survive decoding so they can be parsed.
        adjusted.skip_special_tokens = False
    return adjusted
# ------------------------------------------------------------------
# Static / utility helpers
# ------------------------------------------------------------------
@staticmethod
def _tools_enabled(request: ChatCompletionRequest) -> bool:
"""Check whether tool calling is active for this request."""
try:
tools = getattr(request, "tools", None)
tool_choice = getattr(request, "tool_choice", None)
return bool(tools) and tool_choice != "none"
except Exception:
logger.exception("Failed to determine if tools are enabled.")
return False
def _generate_tool_call_id(self) -> str:
return f"call_{uuid.uuid4().hex[:24]}"
@staticmethod
def _json_escape_string_content(s: str) -> str:
"""JSON-escape a string value (without surrounding quotes)."""
if not s:
return ""
return json.dumps(s, ensure_ascii=False)[1:-1]
# ------------------------------------------------------------------
# Type conversion helpers
# ------------------------------------------------------------------
def _convert_param_value_checked(self, value: str, param_type: str) -> Any:
"""Convert a raw string value to the type indicated by *param_type*.
Raises on failure so the caller can try the next candidate type.
"""
if value.lower() == "null":
return None
param_type = param_type.lower()
if param_type in ("string", "str", "text"):
return value
elif param_type in ("integer", "int"):
return int(value)
elif param_type in ("number", "float"):
val = float(value)
return val if val != int(val) else int(val)
elif param_type in ("boolean", "bool"):
normed = value.strip().lower()
if normed not in ("false", "0", "true", "1"):
raise ValueError(f"Invalid boolean value: {value!r}")
return normed in ("true", "1")
elif param_type in ("object", "array"):
return json.loads(value)
else:
return json.loads(value)
def _convert_param_value(self, value: str, param_type: str | list[str]) -> Any:
"""Try each candidate type in turn; fall back to the raw string."""
if not isinstance(param_type, list):
param_type = [param_type]
for current_type in param_type:
try:
return self._convert_param_value_checked(value, current_type)
except Exception:
continue
return value
def _get_param_schema_type(
self, func_name: str, param_name: str
) -> str | list[str]:
"""Look up the JSON-schema type for a parameter, defaulting to
``"string"``."""
if self.tools:
for tool in self.tools:
if (
hasattr(tool, "function")
and tool.function.name == func_name
and hasattr(tool.function, "parameters")
):
schema = tool.function.parameters
if isinstance(schema, dict) and "properties" in schema:
prop = schema["properties"].get(param_name, {})
if isinstance(prop, dict):
return prop.get("type", "string")
break
return "string"
def _convert_with_schema(
self, func_name: str, param_name: str, value: str
) -> Any:
"""Convert *value* using the tool schema for *func_name*.*param_name*."""
param_type = self._get_param_schema_type(func_name, param_name)
return self._convert_param_value(value, param_type)
def _is_string_type(self, func_name: str, param_name: str) -> bool:
"""Return True if the schema says this parameter is a string."""
ptype = self._get_param_schema_type(func_name, param_name)
if isinstance(ptype, list):
return "string" in ptype
return ptype in ("string", "str", "text")
# ------------------------------------------------------------------
# Non-streaming extraction (unchanged logic, shared helpers)
# ------------------------------------------------------------------
def extract_tool_calls(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> ExtractedToolCallInformation:
    """Extract tool calls from complete model output (non-streaming).

    Every complete function-calls block is scanned for complete invoke
    blocks; each invoke's parameters are converted through the tool
    schema and serialised as the call's JSON ``arguments``.  Text before
    the first function-calls tag is returned as ``content`` (``None``
    when the tag is at position 0).

    If the start token is absent, no complete call is found, or parsing
    raises, the whole output is returned as plain content with
    ``tools_called=False``.
    """
    if self.tool_call_start_token not in model_output:
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )

    try:
        parsed_calls: list[ToolCall] = []
        for block in self.tool_call_complete_regex.findall(model_output):
            for fn_name, fn_body in self.invoke_complete_regex.findall(block):
                # Collect raw values first: a repeated parameter name
                # keeps its last value.
                raw_args: dict[str, str] = {
                    name: raw
                    for name, _string_attr, raw in (
                        self.parameter_complete_regex.findall(fn_body)
                    )
                }
                # Convert types via schema.
                typed_args: dict[str, Any] = {
                    name: self._convert_with_schema(fn_name, name, raw)
                    for name, raw in raw_args.items()
                }
                parsed_calls.append(
                    ToolCall(
                        type="function",
                        function=FunctionCall(
                            name=fn_name,
                            arguments=json.dumps(
                                typed_args, ensure_ascii=False
                            ),
                        ),
                    )
                )

        if parsed_calls:
            tag_at = model_output.find(self.tool_call_start_token)
            leading = model_output[:tag_at] if tag_at > 0 else None
            return ExtractedToolCallInformation(
                tools_called=True, tool_calls=parsed_calls, content=leading
            )
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )
    except Exception:
        logger.exception("Error extracting tool calls from complete output")
        return ExtractedToolCallInformation(
            tools_called=False, tool_calls=[], content=model_output
        )
# ------------------------------------------------------------------
# Streaming helpers — re-parse-and-diff
# ------------------------------------------------------------------
def _reset_streaming_state(self) -> None:
self._sent_content_idx = 0
self._tool_call_ids.clear()
self.streamed_args_for_tool.clear()
self.prev_tool_call_arr.clear()
self.current_tool_id = -1
def _extract_content(self, current_text: str) -> str | None:
"""Return any non-tool-call text that hasn't been sent yet.
Walks *current_text* from ``_sent_content_idx``, collecting text
outside ``<DSMLfunction_calls>`` regions. Uses
``partial_tag_overlap`` to avoid emitting bytes that might turn
out to be the start of the function-calls tag once the next
chunk arrives.
"""
content_segments: list[str] = []
pos = self._sent_content_idx
while pos < len(current_text):
start = current_text.find(self.tool_call_start_token, pos)
if start == -1:
# No (more) tool-call regions — send the tail, minus
# any suffix that could be the beginning of the tag.
tail = current_text[pos:]
overlap = partial_tag_overlap(tail, self.tool_call_start_token)
sendable = tail[: len(tail) - overlap] if overlap else tail
if sendable:
content_segments.append(sendable)
pos = len(current_text) - overlap
break
# Text between previous position and the tag start is content.
if start > pos:
content_segments.append(current_text[pos:start])
# Skip past the tool-call region.
end = current_text.find(self.tool_call_end_token, start)
if end != -1:
pos = end + len(self.tool_call_end_token)
else:
# Region still open — park cursor at start, stop.
pos = start
break
if content_segments:
self._sent_content_idx = pos
return "".join(content_segments)
if pos > self._sent_content_idx:
self._sent_content_idx = pos
return None
def _extract_invoke_regions(
self, text: str
) -> list[tuple[str, str, bool]]:
"""Find all invoke blocks inside the function_calls region.
Returns a list of ``(func_name, inner_text, is_complete)``
tuples. *inner_text* is everything between the invoke open
tag and the close tag (or the end of available text for the
last, potentially incomplete, invoke).
"""
results: list[tuple[str, str, bool]] = []
fc_start = text.find(self.tool_call_start_token)
if fc_start == -1:
return results
region_start = fc_start + len(self.tool_call_start_token)
fc_end = text.find(self.tool_call_end_token, region_start)
region = text[region_start:fc_end] if fc_end != -1 else text[region_start:]
pos = 0
while pos < len(region):
inv_match = self.invoke_start_regex.search(region, pos)
if not inv_match:
break
func_name = inv_match.group(1)
body_start = inv_match.end()
inv_end_pos = region.find(self.invoke_end_token, body_start)
if inv_end_pos != -1:
# Complete invoke block.
body = region[body_start:inv_end_pos]
results.append((func_name, body, True))
pos = inv_end_pos + len(self.invoke_end_token)
else:
# Incomplete — still being generated.
body = region[body_start:]
overlap = partial_tag_overlap(body, self.invoke_end_token)
if overlap:
body = body[:-overlap]
results.append((func_name, body, False))
break
return results
def _build_args_json_so_far(
self,
func_name: str,
inner_text: str,
is_complete: bool,
) -> str:
"""Build a JSON arguments string from the parameters found so far.
Handles both fully-closed ``<DSMLparameter>`` tags and the
single trailing partial parameter whose value is still being
streamed.
"""
# ---- Collect all fully-closed parameters ----
complete_params = self.parameter_complete_regex.findall(inner_text)
parts: list[str] = []
for param_name, string_attr, param_value in complete_params:
key_json = json.dumps(param_name, ensure_ascii=False)
if string_attr == "true":
val_json = json.dumps(param_value, ensure_ascii=False)
else:
converted = self._convert_with_schema(
func_name, param_name, param_value
)
val_json = json.dumps(converted, ensure_ascii=False)
parts.append(f"{key_json}: {val_json}")
# ---- Handle a trailing partial parameter ----
last_param_open = inner_text.rfind("<DSMLparameter")
last_param_close = inner_text.rfind(self.param_end_token)
has_partial = last_param_open != -1 and (
last_param_close == -1 or last_param_close < last_param_open
)
if has_partial:
partial_text = inner_text[last_param_open:]
header_match = self.parameter_header_regex.search(partial_text)
if header_match:
param_name = header_match.group(1)
string_attr = header_match.group(2)
partial_value = partial_text[header_match.end():]
# Strip any bytes that might be the beginning of the
# closing </DSMLparameter> tag.
overlap = partial_tag_overlap(
partial_value, self.param_end_token
)
if overlap:
partial_value = partial_value[:-overlap]
key_json = json.dumps(param_name, ensure_ascii=False)
if is_complete:
# Invoke is closed — treat whatever we have as final.
if string_attr == "true":
val_json = json.dumps(
partial_value, ensure_ascii=False
)
else:
converted = self._convert_with_schema(
func_name, param_name, partial_value
)
val_json = json.dumps(converted, ensure_ascii=False)
parts.append(f"{key_json}: {val_json}")
elif string_attr == "true" or self._is_string_type(
func_name, param_name
):
# Stream as an open JSON string (no closing quote).
escaped = self._json_escape_string_content(partial_value)
parts.append(f'{key_json}: "{escaped}')
else:
# Non-string — emit raw partial value.
parts.append(f"{key_json}: {partial_value}")
# ---- Assemble ----
if not parts:
return "{}" if is_complete else ""
joined = "{" + ", ".join(parts)
if is_complete:
joined += "}"
return joined
def _compute_args_diff(self, index: int, args_so_far: str) -> str | None:
"""Return only the characters in *args_so_far* that haven't been
sent yet, or ``None`` if there's nothing new."""
prev = self.streamed_args_for_tool[index]
if not args_so_far or len(args_so_far) <= len(prev):
return None
diff = args_so_far[len(prev):]
self.streamed_args_for_tool[index] = args_so_far
self.prev_tool_call_arr[index]["arguments"] = args_so_far
return diff
def _ensure_tool_state_for(self, index: int) -> None:
"""Grow the streaming-state arrays so *index* is valid."""
while len(self._tool_call_ids) <= index:
self._tool_call_ids.append(self._generate_tool_call_id())
while len(self.streamed_args_for_tool) <= index:
self.streamed_args_for_tool.append("")
while len(self.prev_tool_call_arr) <= index:
self.prev_tool_call_arr.append({})
# ------------------------------------------------------------------
# Main streaming entry point
# ------------------------------------------------------------------
def extract_tool_calls_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
    request: ChatCompletionRequest,
) -> DeltaMessage | None:
    """Extract tool calls from streaming output using re-parse-and-diff.

    Each call re-parses the whole of *current_text*:

    1. Collect content outside tool-call regions that was not sent yet.
    2. Find every invoke region (complete and partial).
    3. Build the JSON args of each, diff against what was already
       streamed, and emit the difference.

    Streaming state is reset when *previous_text* is empty.  When tools
    are not enabled for *request*, *delta_text* is forwarded unchanged.

    Returns:
        A ``DeltaMessage`` carrying new content and/or tool-call deltas;
        an empty-content ``DeltaMessage`` when the step produced no text
        but did carry token ids after a tool call was seen (so the
        caller can finalise); otherwise ``None``.
    """
    # First chunk of a new stream: start from clean state.
    if not previous_text:
        self._reset_streaming_state()

    # Without tools there is nothing to parse; pass the text through.
    if not self._tools_enabled(request):
        if delta_text:
            return DeltaMessage(content=delta_text)
        return None

    pending_content = self._extract_content(current_text)
    invoke_regions = self._extract_invoke_regions(current_text)

    deltas: list[DeltaToolCall] = []
    for idx, region in enumerate(invoke_regions):
        name, body, closed = region
        self._ensure_tool_state_for(idx)

        # The tool name is announced exactly once per call.
        slot = self.prev_tool_call_arr[idx]
        if "name" not in slot:
            slot["name"] = name
            deltas.append(
                DeltaToolCall(
                    index=idx,
                    id=self._tool_call_ids[idx],
                    type="function",
                    function=DeltaFunctionCall(
                        name=name,
                        arguments="",
                    ),
                )
            )

        # Emit only the argument text that is new since the last step.
        rendered = self._build_args_json_so_far(name, body, closed)
        fresh_args = self._compute_args_diff(idx, rendered)
        if fresh_args:
            deltas.append(
                DeltaToolCall(
                    index=idx,
                    function=DeltaFunctionCall(arguments=fresh_args),
                )
            )

    if invoke_regions:
        self.current_tool_id = len(invoke_regions) - 1

    if pending_content or deltas:
        return DeltaMessage(
            content=pending_content,
            tool_calls=deltas,
        )

    # Empty delta with token ids means EOS or closing tag: return
    # non-None so the serving framework can finalize finish_reason.
    if not delta_text and delta_token_ids and self.prev_tool_call_arr:
        return DeltaMessage(content="")
    return None

View File

@@ -1,379 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any
import torch
from vllm.config import VllmConfig
from vllm.distributed.kv_events import (
BlockStored,
KVCacheEvent,
KVConnectorKVEvents,
KVEventAggregator,
)
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1,
KVConnectorMetadata,
KVConnectorRole,
SupportsHMA,
)
from vllm.logger import init_logger
from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import KVConnectorOutput
if TYPE_CHECKING:
from vllm.forward_context import ForwardContext
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request
logger = init_logger(__name__)
class LMCacheKVEvents(KVConnectorKVEvents):
    """
    Concrete implementation of KVConnectorKVEvents using KVEventAggregator.

    Every method forwards to the wrapped ``KVEventAggregator``; the only
    attribute this class sets is that aggregator.
    """

    def __init__(self, num_workers: int) -> None:
        # All bookkeeping lives in the aggregator.
        self._aggregator = KVEventAggregator(num_workers)

    def add_events(self, events: list[KVCacheEvent]) -> None:
        """Forward *events* to the aggregator."""
        self._aggregator.add_events(events)

    def aggregate(self) -> "LMCacheKVEvents":
        """
        Aggregate KV events and retain only common events.

        Returns ``self``.
        """
        # Take the common events first, then rebuild the aggregator's
        # content from them and reset its worker count.
        common_events = self._aggregator.get_common_events()
        self._aggregator.clear_events()
        self._aggregator.add_events(common_events)
        self._aggregator.reset_workers()
        return self

    def increment_workers(self, count: int = 1) -> None:
        """Forward a worker-count increment of *count* to the aggregator."""
        self._aggregator.increment_workers(count)

    def get_all_events(self) -> list[KVCacheEvent]:
        """Return whatever the aggregator reports as all events."""
        return self._aggregator.get_all_events()

    def get_number_of_workers(self) -> int:
        """Return the aggregator's worker count."""
        return self._aggregator.get_number_of_workers()

    def clear_events(self) -> None:
        """Clear the aggregator's events and reset its worker count."""
        self._aggregator.clear_events()
        self._aggregator.reset_workers()

    def __repr__(self) -> str:
        return f"<LMCacheKVEvents events={self.get_all_events()}>"
class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA):
@classmethod
def requires_piecewise_for_cudagraph(cls, extra_config: dict[str, Any]) -> bool:
    """
    Report whether PIECEWISE CUDA graph mode is required.

    LMCache needs it when layerwise operations are enabled: the
    wait_for_layer_load and save_kv_layer methods perform actual async
    synchronization that cannot be captured in CUDA graphs.

    Returns the ``"use_layerwise"`` entry of *extra_config* as stored
    (``False`` when the key is absent).
    """
    layerwise_enabled = extra_config.get("use_layerwise", False)
    return layerwise_enabled
def __init__(
    self,
    vllm_config: "VllmConfig",
    role: KVConnectorRole,
    kv_cache_config: "KVCacheConfig",
):
    """
    Create the connector and the LMCache engine it delegates to.

    The ``use_native`` entry of the KV-transfer extra config (default
    ``False``) selects the implementation: the adapter bundled under
    vLLM's ``lmcache_integration`` package when true, otherwise the
    adapter shipped in the installed ``lmcache`` package.  Either one is
    imported lazily, only when selected.
    """
    super().__init__(
        vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
    )
    assert vllm_config.kv_transfer_config is not None
    transfer_config = vllm_config.kv_transfer_config

    if transfer_config.get_from_extra_config("use_native", False):
        logger.info("Initializing native LMCache connector")
        # lazy import
        from vllm.distributed.kv_transfer.kv_connector.v1 import lmcache_integration

        engine_cls = lmcache_integration.vllm_v1_adapter.LMCacheConnectorV1Impl
    else:
        logger.info("Initializing latest dev LMCache connector")
        # lazy import
        from lmcache.integration.vllm.vllm_v1_adapter import (
            LMCacheConnectorV1Impl as engine_cls,
        )

    self._lmcache_engine = engine_cls(vllm_config, role, self)
    # Events accumulated by update_connector_output until take_events runs.
    self._kv_cache_events: LMCacheKVEvents | None = None
# ==============================
# Worker-side methods
# ==============================
def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
    """
    Initialize with the KV caches. Useful for pre-registering the
    KV Caches in the KVConnector (e.g. for NIXL).

    If the LMCache engine has no ``register_kv_caches`` attribute, a
    warning is logged and nothing else happens.

    Args:
        kv_caches: dictionary of layer names, kv cache
    """
    engine = self._lmcache_engine
    if not hasattr(engine, "register_kv_caches"):
        logger.warning(
            "LMCache engine does not support register_kv_caches, "
            "please check and use the latest version"
        )
        return
    engine.register_kv_caches(kv_caches)
def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
    """
    Start loading the KV cache from the connector to vLLM's paged
    KV buffer. This is called from the forward context before the
    forward pass to enable async loading during model execution.

    Args:
        forward_context (ForwardContext): the forward context.
        **kwargs: additional arguments for the load operation

    Note:
        The number of elements in kv_caches and layer_names should be
        the same.
    """
    # Pure delegation: arguments are passed to the LMCache engine unchanged.
    self._lmcache_engine.start_load_kv(forward_context, **kwargs)
def wait_for_layer_load(self, layer_name: str) -> None:
    """
    Block until the KV for a specific layer is loaded into vLLM's
    paged buffer. This is called from within the attention layer to
    ensure async copying from start_load_kv is complete.

    This interface will be useful for layer-by-layer pipelining.

    Args:
        layer_name: the name of that layer
    """
    # Pure delegation to the LMCache engine.
    self._lmcache_engine.wait_for_layer_load(layer_name)
def save_kv_layer(
    self,
    layer_name: str,
    kv_layer: torch.Tensor,
    attn_metadata: AttentionMetadata,
    **kwargs: Any,
) -> None:
    """
    Start saving a layer of KV cache from vLLM's paged buffer
    to the connector. This is called from within the attention layer
    to enable async copying during execution.

    Args:
        layer_name (str): the name of the layer.
        kv_layer (torch.Tensor): the paged KV buffer of the current
            layer in vLLM.
        attn_metadata (AttentionMetadata): the attention metadata.
        **kwargs: additional arguments for the save operation.
    """
    # Pure delegation: all arguments are passed to the LMCache engine.
    self._lmcache_engine.save_kv_layer(
        layer_name, kv_layer, attn_metadata, **kwargs
    )
def wait_for_save(self):
    """
    Block until all the save operations are done. This is called
    as the forward context exits to ensure that the async saving
    from save_kv_layer is complete before finishing the forward.

    This prevents overwrites of the paged KV buffer before saving is
    done.
    """
    # Pure delegation to the LMCache engine.
    self._lmcache_engine.wait_for_save()
def get_finished(
    self, finished_req_ids: set[str]
) -> tuple[set[str] | None, set[str] | None]:
    """
    Notifies worker-side connector ids of requests that have
    finished generating tokens.

    Returns:
        ids of requests that have finished asynchronous transfer
        (requests that previously returned True from request_finished()),
        tuple of (sending/saving ids, recving/loading ids).
        The finished saves/sends req ids must belong to a set provided in a
        call to this method (this call or a prior one).
    """
    # The engine's result is returned as-is.
    return self._lmcache_engine.get_finished(finished_req_ids)
def get_block_ids_with_load_errors(self) -> set[int]:
    """
    Get the set of block IDs that failed to load.

    Returns:
        Whatever the engine's ``get_block_ids_with_load_errors`` returns,
        or an empty set when the engine has no such callable (older
        versions that don't support this method).
    """
    reporter = getattr(self._lmcache_engine, "get_block_ids_with_load_errors", None)
    return reporter() if callable(reporter) else set()
def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None:
    """
    Get the KV connector kv cache events collected during the last interval.

    Each event reported by the engine is copied field by field into a
    ``BlockStored`` (``lora_name`` falls back to ``None`` when the event
    has no such attribute), and the results are wrapped in a
    single-worker ``LMCacheKVEvents``.  Returns ``None`` when the engine
    reports no events.
    """
    engine_events = self._lmcache_engine.get_kv_events()  # type: ignore [attr-defined]
    if not engine_events:
        return None

    stored_blocks: list[BlockStored] = [
        BlockStored(
            block_hashes=event.block_hashes,
            parent_block_hash=event.parent_block_hash,
            token_ids=event.token_ids,
            lora_id=event.lora_id,
            block_size=event.block_size,
            medium=event.medium,
            lora_name=getattr(event, "lora_name", None),
        )
        for event in engine_events
    ]

    wrapped = LMCacheKVEvents(num_workers=1)
    wrapped.add_events(stored_blocks)
    return wrapped
# ==============================
# Scheduler-side methods
# ==============================
def get_num_new_matched_tokens(
    self,
    request: "Request",
    num_computed_tokens: int,
) -> tuple[int | None, bool]:
    """
    Get number of new tokens that can be loaded from the
    external KV cache beyond the num_computed_tokens.

    Args:
        request (Request): the request object.
        num_computed_tokens (int): the number of locally
            computed tokens for this request

    Returns:
        A pair: the engine's answer for how many tokens can be loaded
        from the external KV cache beyond what is already computed, and
        a second element that is always ``False``.
    """
    matched = self._lmcache_engine.get_num_new_matched_tokens(
        request, num_computed_tokens
    )
    return matched, False
def update_state_after_alloc(
    self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
):
    """
    Update KVConnector state after block allocation.

    Only *request* and *num_external_tokens* are forwarded to the
    LMCache engine; *blocks* is accepted but not used here.
    """
    self._lmcache_engine.update_state_after_alloc(request, num_external_tokens)
def build_connector_meta(
    self, scheduler_output: SchedulerOutput
) -> KVConnectorMetadata:
    """
    Build the connector metadata for this step.

    This function should NOT modify fields in the scheduler_output.
    Also, calling this function will reset the state of the connector.

    Args:
        scheduler_output (SchedulerOutput): the scheduler output object.
    """
    # The engine builds the metadata; its result is returned as-is.
    return self._lmcache_engine.build_connector_meta(scheduler_output)
def update_connector_output(self, connector_output: KVConnectorOutput):
    """
    Update KVConnector state from worker-side connectors output.

    The output's ``kv_cache_events`` are ignored unless they are a
    non-empty ``LMCacheKVEvents``.  The first accepted batch becomes the
    pending set; later batches add their events and worker count to it.

    Args:
        connector_output (KVConnectorOutput): the worker-side
            connectors output.
    """
    incoming = connector_output.kv_cache_events
    if not (incoming and isinstance(incoming, LMCacheKVEvents)):
        return

    if self._kv_cache_events is not None:
        # Fold the new batch into what is already pending.
        self._kv_cache_events.add_events(incoming.get_all_events())
        self._kv_cache_events.increment_workers(
            incoming.get_number_of_workers()
        )
    else:
        self._kv_cache_events = incoming
def request_finished(
    self,
    request: "Request",
    block_ids: list[int],
) -> tuple[bool, dict[str, Any] | None]:
    """
    Called when a request has finished, before its blocks are freed.

    Returns:
        True if the request is being saved/sent asynchronously and blocks
        should not be freed until the request_id is returned from
        get_finished().
        Optional KVTransferParams to be included in the request outputs
        returned by the engine.
    """
    # The engine's (bool, params) result is returned as-is.
    return self._lmcache_engine.request_finished(request, block_ids)
def request_finished_all_groups(
    self,
    request: "Request",
    block_ids: tuple[list[int], ...],
) -> tuple[bool, dict[str, Any] | None]:
    """
    Called exactly once when a request has finished for all KV cache
    groups (HMA support for hybrid Mamba/Attention models).

    LMCache only stores/offloads attention KV cache blocks, so we
    extract the first group's block IDs and delegate to the
    single-group request_finished.

    Args:
        request: the request object.
        block_ids: tuple of block ID lists, one per KV cache group.

    Returns:
        Same as request_finished.
    """
    # LMCache only handles attention (first) group blocks.
    # Mamba SSM state is managed separately by the scheduler.
    # NOTE(review): this assumes group 0 is the attention group and that
    # block_ids is non-empty (block_ids[0] raises IndexError otherwise);
    # confirm against the scheduler's group ordering.
    return self.request_finished(request, block_ids[0])
def take_events(self) -> Iterable["KVCacheEvent"]:
    """
    Take the KV cache events from the connector.

    When events are pending they are aggregated and yielded; once the
    consumer has iterated past the last one, the store is cleared and
    dropped.  With nothing pending the generator yields nothing.

    Yields:
        New KV cache events since the last call.
    """
    if self._kv_cache_events is None:
        return

    self._kv_cache_events.aggregate()
    yield from self._kv_cache_events.get_all_events()
    # Reached only after the consumer exhausted the events above.
    self._kv_cache_events.clear_events()
    self._kv_cache_events = None