bring in the TTL
This commit is contained in:
18
Dockerfile
18
Dockerfile
@@ -1,4 +1,5 @@
|
||||
FROM vllm/vllm-openai:v0.19.0-cu130
|
||||
#FROM vllm/vllm-openai:v0.19.0-cu130
|
||||
FROM vllm/vllm-openai:cu130-nightly-x86_64
|
||||
|
||||
# Install LMCache for KV cache offloading / sharing across nodes
|
||||
# Build with system CUDA 13.0 for Blackwell (B200)
|
||||
@@ -9,21 +10,16 @@ RUN apt-get update && apt-get install -y git \
|
||||
libcurand-dev-13-0 \
|
||||
libcufft-dev-13-0 \
|
||||
libnvjitlink-dev-13-0 && \
|
||||
git clone https://github.com/neuralwatt/LMCache.git /tmp/lmcache && \
|
||||
git clone https://github.com/biondizzle/LMCache.git /tmp/lmcache && \
|
||||
cd /tmp/lmcache && \
|
||||
git checkout fix/mla-multi-group-kv-cache && \
|
||||
git checkout feat/redis-ttl && \
|
||||
CUDA_HOME=/usr/local/cuda \
|
||||
TORCH_CUDA_ARCH_LIST="10.0" \
|
||||
pip install --no-cache-dir --no-build-isolation . && \
|
||||
rm -rf /tmp/lmcache
|
||||
|
||||
# Copy over the Nemotron reasoning parser
|
||||
COPY ./super_v3_reasoning_parser.py /opt/super_v3_reasoning_parser.py
|
||||
|
||||
# Monkey-patch vLLM's config so the kv_events_config branch only runs when enable_kv_cache_events is set - see https://github.com/vllm-project/vllm/pull/38237/changes#diff-bee6813076031d3ca1edc903c1b02b81e4676519afc562ce3fefe37f20c7b650
|
||||
RUN sed -i "s/if self\.kv_events_config is not None:/if self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events:/" /usr/local/lib/python3.12/dist-packages/vllm/config/vllm.py
|
||||
|
||||
# Patch LMCacheConnectorV1 to support HMA (vLLM's hybrid memory allocator, i.e. the hybrid KV cache manager)
|
||||
# This is required for hybrid models like Nemotron that use both Mamba and Attention layers.
|
||||
# Without this patch, LMCacheConnectorV1 fails with:
|
||||
# "Connector LMCacheConnectorV1 does not support HMA but HMA is enabled"
|
||||
COPY ./lmcache_connector.py /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
|
||||
# Copy over deepseek tool call parser with MTP fixes
|
||||
COPY deepseekv32_tool_parser.py /usr/local/lib/python3.12/dist-packages/vllm/tool_parsers/deepseekv32_tool_parser.py
|
||||
|
||||
@@ -1,337 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
annotations:
|
||||
cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
|
||||
cni.projectcalico.org/podIP: 10.244.248.111/32
|
||||
cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
|
||||
k8s.v1.cni.cncf.io/network-status: |-
|
||||
[{
|
||||
"name": "k8s-pod-network",
|
||||
"ips": [
|
||||
"10.244.248.111",
|
||||
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||||
],
|
||||
"default": true,
|
||||
"dns": {}
|
||||
},{
|
||||
"name": "vllm/ipoib-network-vllm",
|
||||
"interface": "net1",
|
||||
"ips": [
|
||||
"10.66.0.6"
|
||||
],
|
||||
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||||
"dns": {}
|
||||
}]
|
||||
k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
|
||||
k8s.v1.cni.cncf.io/networks-status: |-
|
||||
[{
|
||||
"name": "k8s-pod-network",
|
||||
"ips": [
|
||||
"10.244.248.111",
|
||||
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||||
],
|
||||
"default": true,
|
||||
"dns": {}
|
||||
},{
|
||||
"name": "vllm/ipoib-network-vllm",
|
||||
"interface": "net1",
|
||||
"ips": [
|
||||
"10.66.0.6"
|
||||
],
|
||||
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||||
"dns": {}
|
||||
}]
|
||||
creationTimestamp: '2026-04-15T22:38:27Z'
|
||||
generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
|
||||
generation: 1
|
||||
labels:
|
||||
app.kubernetes.io/component: serving-engine
|
||||
app.kubernetes.io/instance: production-stack-sea-inference
|
||||
app.kubernetes.io/managed-by: helm
|
||||
app.kubernetes.io/name: nemotron-3-super
|
||||
app.kubernetes.io/part-of: vllm-stack
|
||||
environment: test
|
||||
helm-release-name: production-stack-sea-inference
|
||||
model: nemotron-3-super
|
||||
pod-template-hash: 856dc7d695
|
||||
release: test
|
||||
topology.kubernetes.io/region: sea
|
||||
name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
|
||||
namespace: vllm
|
||||
ownerReferences:
|
||||
- apiVersion: apps/v1
|
||||
blockOwnerDeletion: true
|
||||
controller: true
|
||||
kind: ReplicaSet
|
||||
name: >-
|
||||
production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
|
||||
uid: 88c04723-f29b-432a-8318-21a9d389cac4
|
||||
resourceVersion: '29767269'
|
||||
uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- vllm
|
||||
- serve
|
||||
- nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
|
||||
- '--host'
|
||||
- 0.0.0.0
|
||||
- '--port'
|
||||
- '8000'
|
||||
- '--no-enable-prefix-caching'
|
||||
- '--tensor-parallel-size'
|
||||
- '8'
|
||||
- '--async-scheduling'
|
||||
- '--dtype=auto'
|
||||
- '--attention-backend=TRITON_ATTN'
|
||||
- '--gpu_memory_utilization=0.96'
|
||||
- '--enable-auto-tool-choice'
|
||||
- '--tool-call-parser=qwen3_coder'
|
||||
- '--trust_remote_code'
|
||||
- '--max-cudagraph-capture-size=128'
|
||||
- '--enable-chunked-prefill'
|
||||
- '--mamba-ssm-cache-dtype=float16'
|
||||
- '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
|
||||
- '--reasoning-parser=super_v3'
|
||||
- '--max-model-len=1048576'
|
||||
- '--disable-custom-all-reduce'
|
||||
- '--no-disable-hybrid-kv-cache-manager'
|
||||
- '--enforce-eager'
|
||||
- '--kv-transfer-config'
|
||||
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
|
||||
env:
|
||||
- name: PYTHONHASHSEED
|
||||
value: '123'
|
||||
- name: HF_HOME
|
||||
value: /tmp
|
||||
- name: POD_IP
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: status.podIP
|
||||
- name: PROMETHEUS_MULTIPROC_DIR
|
||||
value: /tmp
|
||||
- name: OMP_NUM_THREADS
|
||||
value: '32'
|
||||
- name: HF_TOKEN
|
||||
value: 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
|
||||
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
|
||||
value: '1'
|
||||
- name: NCCL_TOPO_FILE
|
||||
value: /etc/nccl/virtualTopology.xml
|
||||
- name: PYTORCH_CUDA_ALLOC_CONF
|
||||
value: expandable_segments:True
|
||||
- name: LMCACHE_REMOTE_URL
|
||||
value: redis://10.66.0.100:6379
|
||||
- name: LMCACHE_REMOTE_SERDE
|
||||
value: naive
|
||||
- name: LMCACHE_USE_EXPERIMENTAL
|
||||
value: 'True'
|
||||
- name: VLLM_RPC_TIMEOUT
|
||||
value: '1000000'
|
||||
- name: LMCACHE_LOG_LEVEL
|
||||
value: ERROR
|
||||
- name: LMCACHE_LOCAL_CPU
|
||||
value: 'True'
|
||||
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
|
||||
value: '512'
|
||||
- name: LMCACHE_LMCACHE_INSTANCE_ID
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: metadata.name
|
||||
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||||
imagePullPolicy: Always
|
||||
livenessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
name: vllm
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: container-port
|
||||
protocol: TCP
|
||||
- containerPort: 55555
|
||||
name: zmq-port
|
||||
protocol: TCP
|
||||
- containerPort: 9999
|
||||
name: ucx-port
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
requests:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
securityContext:
|
||||
runAsNonRoot: false
|
||||
startupProbe:
|
||||
failureThreshold: 120
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 60
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: shm
|
||||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||||
name: kube-api-access-dlhrd
|
||||
readOnly: true
|
||||
dnsPolicy: ClusterFirst
|
||||
enableServiceLinks: true
|
||||
hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
|
||||
nodeName: b200-nodepool-d51376abbf32
|
||||
preemptionPolicy: PreemptLowerPriority
|
||||
priority: 0
|
||||
restartPolicy: Always
|
||||
schedulerName: default-scheduler
|
||||
securityContext: {}
|
||||
serviceAccount: default
|
||||
serviceAccountName: default
|
||||
subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
|
||||
terminationGracePeriodSeconds: 30
|
||||
tolerations:
|
||||
- effect: NoExecute
|
||||
key: node.kubernetes.io/not-ready
|
||||
operator: Exists
|
||||
tolerationSeconds: 300
|
||||
- effect: NoExecute
|
||||
key: node.kubernetes.io/unreachable
|
||||
operator: Exists
|
||||
tolerationSeconds: 300
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 64Gi
|
||||
name: shm
|
||||
- name: kube-api-access-dlhrd
|
||||
projected:
|
||||
defaultMode: 420
|
||||
sources:
|
||||
- serviceAccountToken:
|
||||
expirationSeconds: 3607
|
||||
path: token
|
||||
- configMap:
|
||||
items:
|
||||
- key: ca.crt
|
||||
path: ca.crt
|
||||
name: kube-root-ca.crt
|
||||
- downwardAPI:
|
||||
items:
|
||||
- fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: metadata.namespace
|
||||
path: namespace
|
||||
status:
|
||||
conditions:
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:32Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: PodReadyToStartContainers
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: Initialized
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
message: 'containers with unready status: [vllm]'
|
||||
observedGeneration: 1
|
||||
reason: ContainersNotReady
|
||||
status: 'False'
|
||||
type: Ready
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
message: 'containers with unready status: [vllm]'
|
||||
observedGeneration: 1
|
||||
reason: ContainersNotReady
|
||||
status: 'False'
|
||||
type: ContainersReady
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: PodScheduled
|
||||
containerStatuses:
|
||||
- allocatedResources:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
containerID: >-
|
||||
containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
|
||||
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||||
imageID: >-
|
||||
atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
|
||||
lastState:
|
||||
terminated:
|
||||
containerID: >-
|
||||
containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
|
||||
exitCode: 1
|
||||
finishedAt: '2026-04-15T22:42:20Z'
|
||||
reason: Error
|
||||
startedAt: '2026-04-15T22:38:31Z'
|
||||
name: vllm
|
||||
ready: false
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
requests:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
restartCount: 1
|
||||
started: false
|
||||
state:
|
||||
running:
|
||||
startedAt: '2026-04-15T22:42:24Z'
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: shm
|
||||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||||
name: kube-api-access-dlhrd
|
||||
readOnly: true
|
||||
recursiveReadOnly: Disabled
|
||||
hostIP: 10.4.96.13
|
||||
hostIPs:
|
||||
- ip: 10.4.96.13
|
||||
- ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
|
||||
observedGeneration: 1
|
||||
phase: Running
|
||||
podIP: 10.244.248.111
|
||||
podIPs:
|
||||
- ip: 10.244.248.111
|
||||
- ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
|
||||
qosClass: Burstable
|
||||
startTime: '2026-04-15T22:38:27Z'
|
||||
@@ -1,362 +0,0 @@
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:43 [default_loader.py:384] Loading weights took 22.09 seconds
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:43 [flashinfer_utils.py:238] Padding intermediate size from 336 to 384 for up/down projection weights.
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:43 [nvfp4.py:401] Using MoEPrepareAndFinalizeNoDPEPMonolithic
|
||||
(Worker_TP0 pid=347) WARNING 04-15 23:04:44 [kv_cache.py:94] Checkpoint does not provide a q scaling factor. Setting it to k_scale. This only matters for FP8 Attention backends (flash-attn or flashinfer).
|
||||
(Worker_TP0 pid=347) WARNING 04-15 23:04:44 [kv_cache.py:108] Using KV cache scaling factor 1.0 for fp8_e4m3. If this is unintended, verify that k/v_scale scaling factors are properly set in the checkpoint.
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:46 [gpu_model_runner.py:4820] Model loading took 10.4 GiB memory and 133.856349 seconds
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:53 [backends.py:1051] Using cache directory: /root/.cache/vllm/torch_compile_cache/3fd416396e/rank_0_0/backbone for vLLM's torch.compile
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:53 [backends.py:1111] Dynamo bytecode transform time: 4.26 s
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:53 [flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm
|
||||
(Worker_TP0 pid=347) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
|
||||
(Worker_TP0 pid=347) return func(*args, **kwargs)
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:54 [flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:57 [backends.py:372] Cache the graph of compile range (1, 128) for later use
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:04:57 [backends.py:372] Cache the graph of compile range (129, 8192) for later use
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:05:10 [backends.py:390] Compiling a graph for compile range (1, 128) takes 13.86 s
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:05:11 [backends.py:390] Compiling a graph for compile range (129, 8192) takes 14.38 s
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:05:13 [decorators.py:640] saved AOT compiled function to /root/.cache/vllm/torch_compile_cache/torch_aot_compile/843944412cd4c5c9ac31fd76eb61f7a06b6ada8d50eaed83ce0c0803840a330f/rank_0_0/model
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:05:13 [monitor.py:48] torch.compile took 24.48 s in total
|
||||
(Worker_TP0 pid=347) INFO 04-15 23:05:20 [monitor.py:76] Initial profiling/warmup run took 7.55 s
|
||||
(Worker_TP3 pid=555) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP3 pid=555) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
|
||||
(Worker_TP6 pid=774) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP6 pid=774) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
|
||||
(Worker_TP5 pid=701) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP5 pid=701) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
|
||||
(Worker_TP0 pid=347) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP0 pid=347) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] EngineCore failed to start.
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] Traceback (most recent call last):
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] return func(*args, **kwargs)
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] super().__init__(
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 124, in __init__
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] kv_cache_config = self._initialize_kv_caches(vllm_config)
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] return func(*args, **kwargs)
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 247, in _initialize_kv_caches
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] available_gpu_memory = self.model_executor.determine_available_memory()
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 136, in determine_available_memory
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] return self.collective_rpc("determine_available_memory")
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 397, in collective_rpc
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] return aggregate(get_response())
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] ^^^^^^^^^^^^^^
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 380, in get_response
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] raise RuntimeError(
|
||||
(EngineCore pid=276) ERROR 04-15 23:05:24 [core.py:1108] RuntimeError: Worker failed with error 'Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.', please check the stack trace above for the root cause
|
||||
(Worker_TP6 pid=774) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP5 pid=701) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP0 pid=347) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP3 pid=555) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP1 pid=414) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP4 pid=628) WARNING 04-15 23:05:24 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP2 pid=482) Exception ignored in: <function ExactWeakKeyDictionary.__setitem__.<locals>.<lambda> at 0x7f04b2c57a60>
|
||||
(Worker_TP2 pid=482) Traceback (most recent call last):
|
||||
(Worker_TP2 pid=482) File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/utils.py", line 1025, in <lambda>
|
||||
(Worker_TP2 pid=482) self.refs[idx] = weakref.ref(key, lambda ref: self._remove_id(idx))
|
||||
(Worker_TP2 pid=482)
|
||||
(Worker_TP2 pid=482) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 797, in signal_handler
|
||||
(Worker_TP2 pid=482) raise SystemExit()
|
||||
(Worker_TP2 pid=482) SystemExit:
|
||||
(Worker_TP2 pid=482) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP2 pid=482) ERROR 04-15 23:05:24 [multiproc_executor.py:949]
|
||||
(Worker_TP7 pid=847) Exception ignored in: <function ExactWeakKeyDictionary.__setitem__.<locals>.<lambda> at 0x7f4be1d7df80>
|
||||
(Worker_TP7 pid=847) Traceback (most recent call last):
|
||||
(Worker_TP7 pid=847) File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/utils.py", line 1025, in <lambda>
|
||||
(Worker_TP7 pid=847) self.refs[idx] = weakref.ref(key, lambda ref: self._remove_id(idx))
|
||||
(Worker_TP7 pid=847)
|
||||
(Worker_TP7 pid=847) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 797, in signal_handler
|
||||
(Worker_TP7 pid=847) raise SystemExit()
|
||||
(Worker_TP7 pid=847) SystemExit:
|
||||
(Worker_TP7 pid=847) WARNING 04-15 23:05:24 [kv_cache_utils.py:1175] Hybrid KV cache manager is disabled for this hybrid model, This means we do not enable any optimizations for saving KV cache memory (e.g., dropping the KV cache outside the sliding window). The compute of layers like sliding window is still saved.
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 381, in determine_available_memory
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5864, in profile_cudagraph_memory
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] self._init_minimal_kv_cache_for_profiling()
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5804, in _init_minimal_kv_cache_for_profiling
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1236, in get_kv_cache_groups
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] unify_hybrid_kv_cache_specs(kv_cache_spec)
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/core/kv_cache_utils.py", line 1216, in unify_hybrid_kv_cache_specs
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP7 pid=847) ERROR 04-15 23:05:24 [multiproc_executor.py:949] ValueError: Hybrid KV cache manager is disabled but failed to convert the KV cache specs to one unified type.
|
||||
616
deepseekv32_tool_parser.py
Normal file
616
deepseekv32_tool_parser.py
Normal file
@@ -0,0 +1,616 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
DeepSeek-V3.2 Tool Call Parser — re-parse-and-diff version.
|
||||
|
||||
Adapted from the GLM-4 streaming fix to make the streaming path robust
|
||||
against multi-token deltas produced by MTP speculative decoding.
|
||||
|
||||
Instead of maintaining incremental state that advances one token at a
|
||||
time, the streaming path re-parses the *entire* current_text on every
|
||||
call, finds all <|DSML|invoke> regions (complete and in-progress),
|
||||
builds a JSON arguments string for each, and diffs against what was
|
||||
previously sent. This makes the parser agnostic to how many tokens
|
||||
arrive per step.
|
||||
|
||||
Key changes vs. the upstream buffer-until-complete parser:
|
||||
1. _extract_content() handles partial tag overlaps so content text
|
||||
is never swallowed or duplicated when a tag boundary lands inside
|
||||
a multi-token chunk.
|
||||
2. _extract_invoke_regions() finds both complete and incomplete
|
||||
invoke blocks, enabling streaming of partial arguments.
|
||||
3. _build_args_json_so_far() constructs the JSON arguments string
|
||||
incrementally from complete + partial <|DSML|parameter> tags.
|
||||
4. _compute_args_diff() emits only the newly-added characters.
|
||||
|
||||
Drop-in replacement: same class name, same interface.
|
||||
"""
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from collections.abc import Sequence
|
||||
from typing import Any
|
||||
|
||||
import regex as re
|
||||
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionRequest,
|
||||
)
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
DeltaFunctionCall,
|
||||
DeltaMessage,
|
||||
DeltaToolCall,
|
||||
ExtractedToolCallInformation,
|
||||
FunctionCall,
|
||||
ToolCall,
|
||||
)
|
||||
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
|
||||
from vllm.logger import init_logger
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tool_parsers.abstract_tool_parser import (
|
||||
Tool,
|
||||
ToolParser,
|
||||
)
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def partial_tag_overlap(text: str, tag: str) -> int:
    """Measure how much of the start of *tag* is dangling at the end of *text*.

    Only proper, non-empty prefixes of *tag* are considered (lengths
    ``1 .. len(tag) - 1``), and the longest one that *text* ends with wins.
    For instance, text ending in ``"<tool_"`` yields 6 for the tag
    ``"<tool_call>"``.

    Args:
        text: The text whose tail is inspected.
        tag: The tag whose leading characters are looked for.

    Returns:
        The length of the longest matching prefix, or 0 if there is none.
    """
    longest_candidate = min(len(text), len(tag) - 1)
    # Walk from the longest candidate downwards so the first hit is the answer.
    hits = (
        size
        for size in range(longest_candidate, 0, -1)
        if text.endswith(tag[:size])
    )
    return next(hits, 0)
|
||||
|
||||
|
||||
class DeepSeekV32ToolParser(ToolParser):
    """Re-parse-and-diff tool parser for the DeepSeek-V3.2 DSML format.

    On every streaming call the parser re-parses ``current_text`` to
    find ``<|DSML|invoke>`` regions, builds the JSON arguments string
    for each tool call, and diffs it against what was previously sent so
    that only new characters are emitted. Because nothing depends on how
    the text was chunked, the result does not depend on how many tokens
    arrive per step.

    Example tool call format::

        <|DSML|function_calls>
        <|DSML|invoke name="get_weather">
        <|DSML|parameter name="location" string="true">Hangzhou</|DSML|parameter>
        <|DSML|parameter name="date" string="true">2024-01-16</|DSML|parameter>
        </|DSML|invoke>
        </|DSML|function_calls>
    """

    def __init__(self, tokenizer: TokenizerLike, tools: list[Tool] | None = None):
        """Set up tag constants, compiled patterns and streaming state.

        Args:
            tokenizer: Tokenizer forwarded to the ``ToolParser`` base.
            tools: Optional tool definitions forwarded to the base; their
                parameter schemas drive value type conversion.

        Raises:
            ValueError: If the base class did not end up with a truthy
                ``model_tokenizer``.
        """
        super().__init__(tokenizer, tools)

        # ----- Tag constants -----
        # NOTE(review): the tags are spelled with ASCII '|' here; confirm this
        # matches the exact characters the model emits for DSML tags.
        self.tool_call_start_token: str = "<|DSML|function_calls>"
        self.tool_call_end_token: str = "</|DSML|function_calls>"
        self.invoke_end_token: str = "</|DSML|invoke>"
        self.param_end_token: str = "</|DSML|parameter>"
        # Literal prefixes of the opening tags that carry attributes.
        self._invoke_open_prefix: str = "<|DSML|invoke"
        self._param_open_prefix: str = "<|DSML|parameter"

        # Alias expected by ToolParser base / adjust_request
        self.tool_calls_start_token = self.tool_call_start_token

        # ----- Compiled regexes -----
        # '|' is the alternation operator in a regular expression, so the
        # literal tag text must be escaped before being embedded in a
        # pattern; unescaped, "<|DSML|invoke..." would match a bare "<".
        fc_open = re.escape(self.tool_call_start_token)
        fc_close = re.escape(self.tool_call_end_token)
        invoke_header = (
            re.escape(self._invoke_open_prefix) + r'\s+name="([^"]+)"\s*>'
        )
        invoke_close = re.escape(self.invoke_end_token)
        param_header = (
            re.escape(self._param_open_prefix)
            + r'\s+name="([^"]+)"\s+string="(true|false)"\s*>'
        )
        param_close = re.escape(self.param_end_token)

        # A complete <|DSML|function_calls>...</|DSML|function_calls> block.
        self.tool_call_complete_regex = re.compile(
            fc_open + r"(.*?)" + fc_close, re.DOTALL
        )
        # Opening tag of an invoke block; captures the function name.
        self.invoke_start_regex = re.compile(invoke_header, re.DOTALL)
        # Complete invoke block; captures (name, body).
        self.invoke_complete_regex = re.compile(
            invoke_header + r"(.*?)" + invoke_close, re.DOTALL
        )
        # Complete parameter tag; captures (name, string_attr, value).
        self.parameter_complete_regex = re.compile(
            param_header + r"(.*?)" + param_close, re.DOTALL
        )
        # Just the opening header of a parameter tag (for partial params).
        self.parameter_header_regex = re.compile(param_header, re.DOTALL)

        # ----- Streaming state (reset per request) -----
        self._sent_content_idx: int = 0
        self._tool_call_ids: list[str] = []
        self.streamed_args_for_tool: list[str] = []
        self.prev_tool_call_arr: list[dict[str, Any]] = []
        self.current_tool_id: int = -1

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ToolParser "
                "constructor during construction."
            )

        logger.debug(
            "Successfully initialized %s", self.__class__.__name__
        )

    # ------------------------------------------------------------------
    # Request adjustment
    # ------------------------------------------------------------------

    def adjust_request(
        self, request: ChatCompletionRequest | ResponsesRequest
    ) -> ChatCompletionRequest | ResponsesRequest:
        """Apply the base adjustments, then keep special tokens when tools are on.

        When the request carries tools and ``tool_choice`` is not
        ``"none"``, ``skip_special_tokens`` is forced to ``False`` so the
        DSML tags survive detokenization.
        """
        request = super().adjust_request(request)
        if request.tools and request.tool_choice != "none":
            # Ensure DSML tokens are not stripped during decoding.
            request.skip_special_tokens = False
        return request

    # ------------------------------------------------------------------
    # Static / utility helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _tools_enabled(request: ChatCompletionRequest) -> bool:
        """Return True if *request* has tools and ``tool_choice`` is not ``"none"``.

        Any exception while inspecting the request is logged and treated
        as "tools disabled".
        """
        try:
            tools = getattr(request, "tools", None)
            tool_choice = getattr(request, "tool_choice", None)
            return bool(tools) and tool_choice != "none"
        except Exception:
            logger.exception("Failed to determine if tools are enabled.")
            return False

    def _generate_tool_call_id(self) -> str:
        """Return a fresh id of the form ``call_`` + 24 hex characters."""
        return f"call_{uuid.uuid4().hex[:24]}"

    @staticmethod
    def _json_escape_string_content(s: str) -> str:
        """JSON-escape a string value (without surrounding quotes)."""
        if not s:
            return ""
        return json.dumps(s, ensure_ascii=False)[1:-1]

    # ------------------------------------------------------------------
    # Type conversion helpers
    # ------------------------------------------------------------------

    def _convert_param_value_checked(self, value: str, param_type: str) -> Any:
        """Convert a raw string value to the type indicated by *param_type*.

        The literal ``null`` (any letter case) maps to ``None`` regardless
        of *param_type*. Raises on failure so the caller can try the next
        candidate type.
        """
        if value.lower() == "null":
            return None

        param_type = param_type.lower()
        if param_type in ("string", "str", "text"):
            return value
        if param_type in ("integer", "int"):
            return int(value)
        if param_type in ("number", "float"):
            val = float(value)
            # Whole-valued floats are returned as ints.
            return val if val != int(val) else int(val)
        if param_type in ("boolean", "bool"):
            normed = value.strip().lower()
            if normed not in ("false", "0", "true", "1"):
                raise ValueError(f"Invalid boolean value: {value!r}")
            return normed in ("true", "1")
        # "object", "array" and any unrecognised type are parsed as JSON.
        return json.loads(value)

    def _convert_param_value(self, value: str, param_type: str | list[str]) -> Any:
        """Try each candidate type in turn; fall back to the raw string."""
        if not isinstance(param_type, list):
            param_type = [param_type]
        for current_type in param_type:
            try:
                return self._convert_param_value_checked(value, current_type)
            except Exception:
                continue
        return value

    def _get_param_schema_type(
        self, func_name: str, param_name: str
    ) -> str | list[str]:
        """Look up the JSON-schema type for a parameter, defaulting to
        ``"string"``.

        Only the first tool whose function name equals *func_name* is
        consulted.
        """
        if self.tools:
            for tool in self.tools:
                if (
                    hasattr(tool, "function")
                    and tool.function.name == func_name
                    and hasattr(tool.function, "parameters")
                ):
                    schema = tool.function.parameters
                    if isinstance(schema, dict) and "properties" in schema:
                        prop = schema["properties"].get(param_name, {})
                        if isinstance(prop, dict):
                            return prop.get("type", "string")
                    break
        return "string"

    def _convert_with_schema(
        self, func_name: str, param_name: str, value: str
    ) -> Any:
        """Convert *value* using the tool schema for *func_name*.*param_name*."""
        param_type = self._get_param_schema_type(func_name, param_name)
        return self._convert_param_value(value, param_type)

    def _is_string_type(self, func_name: str, param_name: str) -> bool:
        """Return True if the schema says this parameter is a string."""
        ptype = self._get_param_schema_type(func_name, param_name)
        if isinstance(ptype, list):
            return "string" in ptype
        return ptype in ("string", "str", "text")

    def _param_value_json(
        self, func_name: str, param_name: str, string_attr: str, raw_value: str
    ) -> str:
        """Serialize one finished parameter value to its final JSON text.

        ``string="true"`` values are emitted verbatim as JSON strings;
        everything else goes through schema-driven conversion first.
        """
        if string_attr == "true":
            return json.dumps(raw_value, ensure_ascii=False)
        converted = self._convert_with_schema(func_name, param_name, raw_value)
        return json.dumps(converted, ensure_ascii=False)

    def _can_stream_partial(
        self,
        func_name: str,
        param_name: str,
        string_attr: str,
        partial_value: str,
    ) -> bool:
        """Decide whether an unfinished value may be streamed as an open string.

        Streaming is only safe when the final serialization is guaranteed
        to start with ``"`` + the escaped partial text; otherwise the
        length-based diff would later emit corrupt JSON.
        """
        if string_attr == "true":
            return True
        ptype = self._get_param_schema_type(func_name, param_name)
        # Union types are tried in order and may convert to a non-string.
        if not isinstance(ptype, str):
            return False
        if ptype.lower() not in ("string", "str", "text"):
            return False
        # A final value of "null" (any case) converts to JSON null, so hold
        # back while the text seen so far could still grow into it.
        return not "null".startswith(partial_value.lower())

    # ------------------------------------------------------------------
    # Non-streaming extraction
    # ------------------------------------------------------------------

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """Extract tool calls from complete model output (non-streaming).

        Returns the tool calls found in complete function-calls blocks,
        with ``content`` set to the text before the first start tag (or
        ``None`` if there is none). If no start tag is present, no call is
        found, or parsing raises, the whole output is returned as content
        with ``tools_called=False``.
        """
        if self.tool_call_start_token not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        try:
            tool_calls: list[ToolCall] = []

            for fc_block in self.tool_call_complete_regex.findall(model_output):
                for invoke_name, invoke_body in self.invoke_complete_regex.findall(
                    fc_block
                ):
                    # Parse all parameters in this invoke; a repeated name
                    # keeps its last value.
                    raw_params: dict[str, str] = {}
                    for pname, _str_attr, pval in (
                        self.parameter_complete_regex.findall(invoke_body)
                    ):
                        raw_params[pname] = pval

                    # Convert types via schema.
                    converted: dict[str, Any] = {}
                    for pname, pval in raw_params.items():
                        converted[pname] = self._convert_with_schema(
                            invoke_name, pname, pval
                        )

                    tool_calls.append(
                        ToolCall(
                            type="function",
                            function=FunctionCall(
                                name=invoke_name,
                                arguments=json.dumps(
                                    converted, ensure_ascii=False
                                ),
                            ),
                        )
                    )

            if not tool_calls:
                return ExtractedToolCallInformation(
                    tools_called=False, tool_calls=[], content=model_output
                )

            first_idx = model_output.find(self.tool_call_start_token)
            content = model_output[:first_idx] if first_idx > 0 else None

            return ExtractedToolCallInformation(
                tools_called=True, tool_calls=tool_calls, content=content
            )

        except Exception:
            logger.exception("Error extracting tool calls from complete output")
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    # ------------------------------------------------------------------
    # Streaming helpers (re-parse-and-diff)
    # ------------------------------------------------------------------

    def _reset_streaming_state(self) -> None:
        """Clear all per-stream bookkeeping (lists are emptied in place)."""
        self._sent_content_idx = 0
        self._tool_call_ids.clear()
        self.streamed_args_for_tool.clear()
        self.prev_tool_call_arr.clear()
        self.current_tool_id = -1

    def _extract_content(self, current_text: str) -> str | None:
        """Return any non-tool-call text that hasn't been sent yet.

        Walks *current_text* from ``_sent_content_idx``, collecting text
        outside function-calls regions, and advances
        ``_sent_content_idx`` past what was consumed. A trailing suffix
        that could be the beginning of the start tag is withheld until
        the next chunk disambiguates it.

        Returns:
            The new content, or ``None`` if there is nothing to send.
        """
        content_segments: list[str] = []
        pos = self._sent_content_idx

        while pos < len(current_text):
            start = current_text.find(self.tool_call_start_token, pos)
            if start == -1:
                # No (more) tool-call regions: send the tail, minus
                # any suffix that could be the beginning of the tag.
                tail = current_text[pos:]
                overlap = partial_tag_overlap(tail, self.tool_call_start_token)
                sendable = tail[: len(tail) - overlap] if overlap else tail
                if sendable:
                    content_segments.append(sendable)
                pos = len(current_text) - overlap
                break

            # Text between previous position and the tag start is content.
            if start > pos:
                content_segments.append(current_text[pos:start])

            # Skip past the tool-call region.
            end = current_text.find(self.tool_call_end_token, start)
            if end != -1:
                pos = end + len(self.tool_call_end_token)
            else:
                # Region still open: park cursor at start, stop.
                pos = start
                break

        if content_segments:
            self._sent_content_idx = pos
            return "".join(content_segments)
        if pos > self._sent_content_idx:
            self._sent_content_idx = pos
        return None

    def _extract_invoke_regions(
        self, text: str
    ) -> list[tuple[str, str, bool]]:
        """Find all invoke blocks inside the first function-calls region.

        Returns a list of ``(func_name, inner_text, is_complete)``
        tuples. *inner_text* is everything between the invoke open
        tag and the close tag (or the end of available text for the
        last, potentially incomplete, invoke, minus any trailing
        fragment of the invoke close tag).
        """
        results: list[tuple[str, str, bool]] = []

        fc_start = text.find(self.tool_call_start_token)
        if fc_start == -1:
            return results

        region_start = fc_start + len(self.tool_call_start_token)
        fc_end = text.find(self.tool_call_end_token, region_start)
        region = text[region_start:fc_end] if fc_end != -1 else text[region_start:]

        pos = 0
        while pos < len(region):
            inv_match = self.invoke_start_regex.search(region, pos)
            if not inv_match:
                break

            func_name = inv_match.group(1)
            body_start = inv_match.end()

            inv_end_pos = region.find(self.invoke_end_token, body_start)
            if inv_end_pos != -1:
                # Complete invoke block.
                body = region[body_start:inv_end_pos]
                results.append((func_name, body, True))
                pos = inv_end_pos + len(self.invoke_end_token)
            else:
                # Incomplete: still being generated.
                body = region[body_start:]
                overlap = partial_tag_overlap(body, self.invoke_end_token)
                if overlap:
                    body = body[:-overlap]
                results.append((func_name, body, False))
                break

        return results

    def _build_args_json_so_far(
        self,
        func_name: str,
        inner_text: str,
        is_complete: bool,
    ) -> str:
        """Build a JSON arguments string from the parameters found so far.

        Handles both fully-closed parameter tags and the single trailing
        partial parameter whose value is still being streamed. Every
        string returned for a growing *inner_text* is meant to extend the
        previous one, because the caller diffs by length. For that reason
        a partial value is only emitted when its final form is known to
        extend it (see ``_can_stream_partial``); other partial values are
        held back until their parameter closes.

        Returns:
            The (possibly unterminated) JSON object text, ``"{}"`` for a
            complete invoke without parameters, or ``""`` when nothing
            can be emitted yet.
        """
        parts: list[str] = []

        # ---- Collect all fully-closed parameters ----
        for param_name, string_attr, param_value in (
            self.parameter_complete_regex.findall(inner_text)
        ):
            key_json = json.dumps(param_name, ensure_ascii=False)
            val_json = self._param_value_json(
                func_name, param_name, string_attr, param_value
            )
            parts.append(f"{key_json}: {val_json}")

        # ---- Handle a trailing partial parameter ----
        last_param_open = inner_text.rfind(self._param_open_prefix)
        last_param_close = inner_text.rfind(self.param_end_token)
        has_partial = last_param_open != -1 and (
            last_param_close == -1 or last_param_close < last_param_open
        )

        if has_partial:
            partial_text = inner_text[last_param_open:]
            header_match = self.parameter_header_regex.search(partial_text)

            if header_match:
                param_name = header_match.group(1)
                string_attr = header_match.group(2)
                partial_value = partial_text[header_match.end():]

                # Strip any characters that might be the beginning of the
                # closing parameter tag.
                overlap = partial_tag_overlap(
                    partial_value, self.param_end_token
                )
                if overlap:
                    partial_value = partial_value[:-overlap]

                key_json = json.dumps(param_name, ensure_ascii=False)

                if is_complete:
                    # Invoke is closed: treat whatever we have as final.
                    val_json = self._param_value_json(
                        func_name, param_name, string_attr, partial_value
                    )
                    parts.append(f"{key_json}: {val_json}")
                elif self._can_stream_partial(
                    func_name, param_name, string_attr, partial_value
                ):
                    # Stream as an open JSON string (no closing quote).
                    escaped = self._json_escape_string_content(partial_value)
                    parts.append(f'{key_json}: "{escaped}')
                # Otherwise hold the value back: its converted final form
                # (e.g. "1.0" -> 1) need not extend the raw text seen so far.

        # ---- Assemble ----
        if not parts:
            return "{}" if is_complete else ""

        joined = "{" + ", ".join(parts)
        if is_complete:
            joined += "}"
        return joined

    def _compute_args_diff(self, index: int, args_so_far: str) -> str | None:
        """Return only the characters in *args_so_far* that haven't been
        sent yet, or ``None`` if there's nothing new.

        On success the per-tool streamed text and the ``"arguments"``
        entry of ``prev_tool_call_arr[index]`` are updated.
        """
        prev = self.streamed_args_for_tool[index]
        if not args_so_far or len(args_so_far) <= len(prev):
            return None
        if not args_so_far.startswith(prev):
            # What was already sent cannot be retracted; slicing by length
            # here would append characters that do not line up with it.
            logger.debug(
                "Skipping non-incremental arguments update for tool call %d",
                index,
            )
            return None
        diff = args_so_far[len(prev):]
        self.streamed_args_for_tool[index] = args_so_far
        self.prev_tool_call_arr[index]["arguments"] = args_so_far
        return diff

    def _ensure_tool_state_for(self, index: int) -> None:
        """Grow the streaming-state arrays so *index* is valid."""
        while len(self._tool_call_ids) <= index:
            self._tool_call_ids.append(self._generate_tool_call_id())
        while len(self.streamed_args_for_tool) <= index:
            self.streamed_args_for_tool.append("")
        while len(self.prev_tool_call_arr) <= index:
            self.prev_tool_call_arr.append({})

    # ------------------------------------------------------------------
    # Main streaming entry point
    # ------------------------------------------------------------------

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """Extract tool calls from streaming output using re-parse-and-diff.

        On every call we:
        1. Re-scan *current_text* for content outside tool-call regions.
        2. Find all invoke regions (complete + partial).
        3. Build JSON args for each, diff against previous, emit deltas.

        Streaming state is reset whenever *previous_text* is empty.

        Returns:
            A ``DeltaMessage`` carrying new content and/or tool-call
            deltas, an empty-content message when token ids arrived
            without text after a tool call was seen, or ``None`` when
            there is nothing to emit.
        """
        # First chunk of a new stream: reset state.
        if not previous_text:
            self._reset_streaming_state()

        # If tools aren't enabled, just forward content.
        if not self._tools_enabled(request):
            return DeltaMessage(content=delta_text) if delta_text else None

        # 1. Extract any content outside tool-call regions.
        content = self._extract_content(current_text)

        # 2. Find all invoke regions.
        regions = self._extract_invoke_regions(current_text)
        tool_call_deltas: list[DeltaToolCall] = []

        for i, (func_name, inner_text, is_complete) in enumerate(regions):
            self._ensure_tool_state_for(i)

            # Emit the tool name (once per tool call).
            if "name" not in self.prev_tool_call_arr[i]:
                self.prev_tool_call_arr[i]["name"] = func_name
                tool_call_deltas.append(
                    DeltaToolCall(
                        index=i,
                        id=self._tool_call_ids[i],
                        type="function",
                        function=DeltaFunctionCall(
                            name=func_name,
                            arguments="",
                        ),
                    )
                )

            # Build the JSON args so far and emit the diff.
            args_so_far = self._build_args_json_so_far(
                func_name, inner_text, is_complete
            )
            diff = self._compute_args_diff(i, args_so_far)
            if diff:
                tool_call_deltas.append(
                    DeltaToolCall(
                        index=i,
                        function=DeltaFunctionCall(arguments=diff),
                    )
                )

        if regions:
            self.current_tool_id = len(regions) - 1

        # 3. Return a delta if we have content or tool-call updates.
        if content or tool_call_deltas:
            return DeltaMessage(
                content=content,
                tool_calls=tool_call_deltas,
            )

        # Empty delta with token ids means EOS or closing tag: return
        # non-None so the serving framework can finalize finish_reason.
        if not delta_text and delta_token_ids and self.prev_tool_call_arr:
            return DeltaMessage(content="")

        return None
|
||||
@@ -1,379 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Iterable
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_events import (
|
||||
BlockStored,
|
||||
KVCacheEvent,
|
||||
KVConnectorKVEvents,
|
||||
KVEventAggregator,
|
||||
)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
KVConnectorBase_V1,
|
||||
KVConnectorMetadata,
|
||||
KVConnectorRole,
|
||||
SupportsHMA,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.attention.backend import AttentionMetadata
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.outputs import KVConnectorOutput
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.request import Request
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class LMCacheKVEvents(KVConnectorKVEvents):
    """KV-event container for LMCache.

    Every operation is delegated to an internal ``KVEventAggregator``
    created for the given number of workers.
    """

    def __init__(self, num_workers: int) -> None:
        """Create the backing aggregator for *num_workers* workers."""
        self._aggregator = KVEventAggregator(num_workers)

    def add_events(self, events: list[KVCacheEvent]) -> None:
        """Hand *events* over to the aggregator."""
        self._aggregator.add_events(events)

    def aggregate(self) -> "LMCacheKVEvents":
        """Replace the stored events with the aggregator's common events.

        The worker bookkeeping is reset afterwards.

        Returns:
            This same instance, to allow chaining.
        """
        aggregator = self._aggregator
        shared_events = aggregator.get_common_events()
        aggregator.clear_events()
        aggregator.add_events(shared_events)
        aggregator.reset_workers()
        return self

    def increment_workers(self, count: int = 1) -> None:
        """Raise the aggregator's worker count by *count*."""
        self._aggregator.increment_workers(count)

    def get_all_events(self) -> list[KVCacheEvent]:
        """Return every event the aggregator currently reports."""
        return self._aggregator.get_all_events()

    def get_number_of_workers(self) -> int:
        """Return the worker count reported by the aggregator."""
        return self._aggregator.get_number_of_workers()

    def clear_events(self) -> None:
        """Drop all stored events and reset the worker bookkeeping."""
        aggregator = self._aggregator
        aggregator.clear_events()
        aggregator.reset_workers()

    def __repr__(self) -> str:
        current_events = self.get_all_events()
        return f"<LMCacheKVEvents events={current_events}>"
|
||||
|
||||
|
||||
class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA):
|
||||
@classmethod
|
||||
def requires_piecewise_for_cudagraph(cls, extra_config: dict[str, Any]) -> bool:
|
||||
"""
|
||||
LMCache requires PIECEWISE CUDA graph mode when layerwise
|
||||
operations are enabled. The wait_for_layer_load and save_kv_layer
|
||||
methods perform actual async synchronization that cannot be
|
||||
captured in CUDA graphs.
|
||||
"""
|
||||
return extra_config.get("use_layerwise", False)
|
||||
|
||||
def __init__(
    self,
    vllm_config: "VllmConfig",
    role: KVConnectorRole,
    kv_cache_config: "KVCacheConfig",
):
    """Create the connector and the LMCache engine it delegates to.

    The engine implementation is picked from the ``use_native`` entry of
    the KV-transfer extra config: the adapter shipped inside vLLM when it
    is truthy, otherwise the adapter from the installed ``lmcache``
    package. Both are imported lazily.

    Args:
        vllm_config: Engine configuration; its ``kv_transfer_config``
            must not be ``None``.
        role: Connector role, forwarded to the base class and the engine.
        kv_cache_config: KV cache configuration forwarded to the base class.
    """
    super().__init__(
        vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
    )
    transfer_config = vllm_config.kv_transfer_config
    assert transfer_config is not None

    if transfer_config.get_from_extra_config("use_native", False):
        logger.info("Initializing native LMCache connector")
        # lazy import
        from vllm.distributed.kv_transfer.kv_connector.v1 import lmcache_integration

        engine_cls = lmcache_integration.vllm_v1_adapter.LMCacheConnectorV1Impl
    else:
        logger.info("Initializing latest dev LMCache connector")
        # lazy import
        from lmcache.integration.vllm.vllm_v1_adapter import (
            LMCacheConnectorV1Impl as engine_cls,
        )

    self._lmcache_engine = engine_cls(vllm_config, role, self)

    # No KV cache events have been collected yet.
    self._kv_cache_events: LMCacheKVEvents | None = None
|
||||
|
||||
# ==============================
|
||||
# Worker-side methods
|
||||
# ==============================
|
||||
def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
    """Pass the KV caches on to the LMCache engine, if it accepts them.

    Engines without a ``register_kv_caches`` attribute only trigger a
    warning.

    Args:
        kv_caches: dictionary of layer names, kv cache
    """
    engine = self._lmcache_engine
    if not hasattr(engine, "register_kv_caches"):
        logger.warning(
            "LMCache engine does not support register_kv_caches, "
            "please check and use the latest version"
        )
        return
    engine.register_kv_caches(kv_caches)
|
||||
|
||||
def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
    """Ask the LMCache engine to start loading KV cache into vLLM's paged
    KV buffer.

    Called from the forward context before the forward pass so loading
    can proceed asynchronously during model execution.

    Args:
        forward_context (ForwardContext): the forward context.
        **kwargs: additional arguments for the load operation

    Note:
        The number of elements in kv_caches and layer_names should be
        the same.
    """
    engine = self._lmcache_engine
    engine.start_load_kv(forward_context, **kwargs)
|
||||
|
||||
def wait_for_layer_load(self, layer_name: str) -> None:
    """Delegate to the engine's wait for one layer's KV load.

    Called from within an attention layer so that the async copy started
    by start_load_kv is complete before the layer's KV is used; this
    enables layer-by-layer pipelining.

    Args:
        layer_name: the name of that layer
    """
    engine = self._lmcache_engine
    engine.wait_for_layer_load(layer_name)
|
||||
|
||||
def save_kv_layer(
    self,
    layer_name: str,
    kv_layer: torch.Tensor,
    attn_metadata: AttentionMetadata,
    **kwargs: Any,
) -> None:
    """Ask the LMCache engine to start saving one layer of KV cache from
    vLLM's paged buffer.

    Called from within an attention layer so the copy can overlap with
    execution.

    Args:
        layer_name (str): the name of the layer.
        kv_layer (torch.Tensor): the paged KV buffer of the current
            layer in vLLM.
        attn_metadata (AttentionMetadata): the attention metadata.
        **kwargs: additional arguments for the save operation.
    """
    engine = self._lmcache_engine
    engine.save_kv_layer(layer_name, kv_layer, attn_metadata, **kwargs)
|
||||
|
||||
def wait_for_save(self):
    """Delegate to the engine's wait for all pending save operations.

    Called as the forward context exits so the async saving started by
    save_kv_layer completes before the forward pass ends, which prevents
    the paged KV buffer from being overwritten before it is saved.
    """
    engine = self._lmcache_engine
    engine.wait_for_save()
|
||||
|
||||
def get_finished(
|
||||
self, finished_req_ids: set[str]
|
||||
) -> tuple[set[str] | None, set[str] | None]:
|
||||
"""
|
||||
Notifies worker-side connector ids of requests that have
|
||||
finished generating tokens.
|
||||
|
||||
Returns:
|
||||
ids of requests that have finished asynchronous transfer
|
||||
(requests that previously returned True from request_finished()),
|
||||
tuple of (sending/saving ids, recving/loading ids).
|
||||
The finished saves/sends req ids must belong to a set provided in a
|
||||
call to this method (this call or a prior one).
|
||||
"""
|
||||
return self._lmcache_engine.get_finished(finished_req_ids)
|
||||
|
||||
def get_block_ids_with_load_errors(self) -> set[int]:
    """Report the block IDs that failed to load.

    Returns:
        Whatever the engine's ``get_block_ids_with_load_errors`` returns,
        or an empty set when the engine has no such callable (older
        versions).
    """
    reporter = getattr(self._lmcache_engine, "get_block_ids_with_load_errors", None)
    return reporter() if callable(reporter) else set()
|
||||
|
||||
def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None:
    """Collect the engine's KV events and wrap them for vLLM.

    Each event returned by the engine's ``get_kv_events()`` is copied
    into a ``BlockStored`` record; ``lora_name`` falls back to ``None``
    when the event has no such attribute.

    Returns:
        A single-worker ``LMCacheKVEvents`` holding the records, or
        ``None`` when the engine reports no events.
    """
    raw_events = self._lmcache_engine.get_kv_events()  # type: ignore [attr-defined]
    if not raw_events:
        return None

    stored_blocks: list[BlockStored] = []
    for event in raw_events:
        stored_blocks.append(
            BlockStored(
                block_hashes=event.block_hashes,
                parent_block_hash=event.parent_block_hash,
                token_ids=event.token_ids,
                lora_id=event.lora_id,
                block_size=event.block_size,
                medium=event.medium,
                lora_name=getattr(event, "lora_name", None),
            )
        )

    wrapped = LMCacheKVEvents(num_workers=1)
    wrapped.add_events(stored_blocks)
    return wrapped
|
||||
|
||||
# ==============================
|
||||
# Scheduler-side methods
|
||||
# ==============================
|
||||
def get_num_new_matched_tokens(
|
||||
self,
|
||||
request: "Request",
|
||||
num_computed_tokens: int,
|
||||
) -> tuple[int | None, bool]:
|
||||
"""
|
||||
Get number of new tokens that can be loaded from the
|
||||
external KV cache beyond the num_computed_tokens.
|
||||
|
||||
Args:
|
||||
request (Request): the request object.
|
||||
num_computed_tokens (int): the number of locally
|
||||
computed tokens for this request
|
||||
|
||||
Returns:
|
||||
the number of tokens that can be loaded from the
|
||||
external KV cache beyond what is already computed.
|
||||
"""
|
||||
return self._lmcache_engine.get_num_new_matched_tokens(
|
||||
request, num_computed_tokens
|
||||
), False
|
||||
|
||||
def update_state_after_alloc(
    self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
):
    """
    Inform the LMCache engine that block allocation for a request is done.

    Only ``request`` and ``num_external_tokens`` are forwarded to the
    engine; ``blocks`` is accepted to satisfy the signature but is not
    passed on.
    """
    engine = self._lmcache_engine
    engine.update_state_after_alloc(request, num_external_tokens)
|
||||
|
||||
def build_connector_meta(
    self, scheduler_output: SchedulerOutput
) -> KVConnectorMetadata:
    """
    Produce this step's connector metadata by delegating to the LMCache
    engine.

    This function should NOT modify fields in the scheduler_output.
    Also, calling this function will reset the state of the connector.

    Args:
        scheduler_output (SchedulerOutput): the scheduler output object.

    Returns:
        The metadata object returned by the engine's
        ``build_connector_meta``.
    """
    connector_meta = self._lmcache_engine.build_connector_meta(scheduler_output)
    return connector_meta
|
||||
|
||||
def update_connector_output(self, connector_output: KVConnectorOutput):
    """
    Fold the KV cache events carried by a worker-side connector output
    into the events held by this connector.

    Args:
        connector_output (KVConnectorOutput): the worker-side
            connectors output.
    """
    incoming = connector_output.kv_cache_events
    # Ignore outputs that carry no events or events of a foreign type.
    if not (incoming and isinstance(incoming, LMCacheKVEvents)):
        return

    if self._kv_cache_events is None:
        # First batch since the last drain: adopt the object as-is.
        self._kv_cache_events = incoming
        return

    # Otherwise merge the events and account for the additional workers.
    self._kv_cache_events.add_events(incoming.get_all_events())
    self._kv_cache_events.increment_workers(incoming.get_number_of_workers())
|
||||
|
||||
def request_finished(
|
||||
self,
|
||||
request: "Request",
|
||||
block_ids: list[int],
|
||||
) -> tuple[bool, dict[str, Any] | None]:
|
||||
"""
|
||||
Called when a request has finished, before its blocks are freed.
|
||||
|
||||
Returns:
|
||||
True if the request is being saved/sent asynchronously and blocks
|
||||
should not be freed until the request_id is returned from
|
||||
get_finished().
|
||||
Optional KVTransferParams to be included in the request outputs
|
||||
returned by the engine.
|
||||
"""
|
||||
return self._lmcache_engine.request_finished(request, block_ids)
|
||||
|
||||
def request_finished_all_groups(
|
||||
self,
|
||||
request: "Request",
|
||||
block_ids: tuple[list[int], ...],
|
||||
) -> tuple[bool, dict[str, Any] | None]:
|
||||
"""
|
||||
Called exactly once when a request has finished for all KV cache
|
||||
groups (HMA support for hybrid Mamba/Attention models).
|
||||
|
||||
LMCache only stores/offloads attention KV cache blocks, so we
|
||||
extract the first group's block IDs and delegate to the
|
||||
single-group request_finished.
|
||||
|
||||
Args:
|
||||
request: the request object.
|
||||
block_ids: tuple of block ID lists, one per KV cache group.
|
||||
|
||||
Returns:
|
||||
Same as request_finished.
|
||||
"""
|
||||
# LMCache only handles attention (first) group blocks.
|
||||
# Mamba SSM state is managed separately by the scheduler.
|
||||
return self.request_finished(request, block_ids[0])
|
||||
|
||||
def take_events(self) -> Iterable["KVCacheEvent"]:
    """
    Drain the KV cache events accumulated on this connector.

    This is a generator: nothing happens until it is iterated. Once the
    yielded events have been fully consumed, the event container is
    cleared and the connector's reference to it is reset to ``None``.

    Yields:
        New KV cache events since the last call.
    """
    if self._kv_cache_events is None:
        # Nothing accumulated since the last drain.
        return

    self._kv_cache_events.aggregate()
    drained = self._kv_cache_events.get_all_events()
    yield from drained

    # Reached only after the consumer has exhausted the events above.
    self._kv_cache_events.clear_events()
    self._kv_cache_events = None
|
||||
Reference in New Issue
Block a user