monkey patch the monkey pathing vllm nonsense
This commit is contained in:
@@ -20,4 +20,10 @@ RUN apt-get update && apt-get install -y git \
|
||||
COPY ./super_v3_reasoning_parser.py /opt/super_v3_reasoning_parser.py
|
||||
|
||||
# Monkey patch more vllm stuff - https://github.com/vllm-project/vllm/pull/38237/changes#diff-bee6813076031d3ca1edc903c1b02b81e4676519afc562ce3fefe37f20c7b650
|
||||
RUN sed -i "s/if self\.kv_events_config is not None:/if self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events:/" /usr/local/lib/python3.12/dist-packages/vllm/config/vllm.py
|
||||
RUN sed -i "s/if self\.kv_events_config is not None:/if self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events:/" /usr/local/lib/python3.12/dist-packages/vllm/config/vllm.py
|
||||
|
||||
# Patch LMCacheConnectorV1 to support HMA (Hybrid Mamba/Attention KV cache manager)
|
||||
# This is required for hybrid models like Nemotron that use both Mamba and Attention layers.
|
||||
# Without this patch, LMCacheConnectorV1 fails with:
|
||||
# "Connector LMCacheConnectorV1 does not support HMA but HMA is enabled"
|
||||
COPY ./lmcache_connector.py /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
|
||||
337
THIS_IS_MY_POD_SUMMARY_VINNY.md
Normal file
337
THIS_IS_MY_POD_SUMMARY_VINNY.md
Normal file
@@ -0,0 +1,337 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
annotations:
|
||||
cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
|
||||
cni.projectcalico.org/podIP: 10.244.248.111/32
|
||||
cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
|
||||
k8s.v1.cni.cncf.io/network-status: |-
|
||||
[{
|
||||
"name": "k8s-pod-network",
|
||||
"ips": [
|
||||
"10.244.248.111",
|
||||
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||||
],
|
||||
"default": true,
|
||||
"dns": {}
|
||||
},{
|
||||
"name": "vllm/ipoib-network-vllm",
|
||||
"interface": "net1",
|
||||
"ips": [
|
||||
"10.66.0.6"
|
||||
],
|
||||
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||||
"dns": {}
|
||||
}]
|
||||
k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
|
||||
k8s.v1.cni.cncf.io/networks-status: |-
|
||||
[{
|
||||
"name": "k8s-pod-network",
|
||||
"ips": [
|
||||
"10.244.248.111",
|
||||
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||||
],
|
||||
"default": true,
|
||||
"dns": {}
|
||||
},{
|
||||
"name": "vllm/ipoib-network-vllm",
|
||||
"interface": "net1",
|
||||
"ips": [
|
||||
"10.66.0.6"
|
||||
],
|
||||
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||||
"dns": {}
|
||||
}]
|
||||
creationTimestamp: '2026-04-15T22:38:27Z'
|
||||
generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
|
||||
generation: 1
|
||||
labels:
|
||||
app.kubernetes.io/component: serving-engine
|
||||
app.kubernetes.io/instance: production-stack-sea-inference
|
||||
app.kubernetes.io/managed-by: helm
|
||||
app.kubernetes.io/name: nemotron-3-super
|
||||
app.kubernetes.io/part-of: vllm-stack
|
||||
environment: test
|
||||
helm-release-name: production-stack-sea-inference
|
||||
model: nemotron-3-super
|
||||
pod-template-hash: 856dc7d695
|
||||
release: test
|
||||
topology.kubernetes.io/region: sea
|
||||
name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
|
||||
namespace: vllm
|
||||
ownerReferences:
|
||||
- apiVersion: apps/v1
|
||||
blockOwnerDeletion: true
|
||||
controller: true
|
||||
kind: ReplicaSet
|
||||
name: >-
|
||||
production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
|
||||
uid: 88c04723-f29b-432a-8318-21a9d389cac4
|
||||
resourceVersion: '29767269'
|
||||
uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- vllm
|
||||
- serve
|
||||
- nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
|
||||
- '--host'
|
||||
- 0.0.0.0
|
||||
- '--port'
|
||||
- '8000'
|
||||
- '--no-enable-prefix-caching'
|
||||
- '--tensor-parallel-size'
|
||||
- '8'
|
||||
- '--async-scheduling'
|
||||
- '--dtype=auto'
|
||||
- '--attention-backend=TRITON_ATTN'
|
||||
- '--gpu_memory_utilization=0.96'
|
||||
- '--enable-auto-tool-choice'
|
||||
- '--tool-call-parser=qwen3_coder'
|
||||
- '--trust_remote_code'
|
||||
- '--max-cudagraph-capture-size=128'
|
||||
- '--enable-chunked-prefill'
|
||||
- '--mamba-ssm-cache-dtype=float16'
|
||||
- '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
|
||||
- '--reasoning-parser=super_v3'
|
||||
- '--max-model-len=1048576'
|
||||
- '--disable-custom-all-reduce'
|
||||
- '--no-disable-hybrid-kv-cache-manager'
|
||||
- '--enforce-eager'
|
||||
- '--kv-transfer-config'
|
||||
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
|
||||
env:
|
||||
- name: PYTHONHASHSEED
|
||||
value: '123'
|
||||
- name: HF_HOME
|
||||
value: /tmp
|
||||
- name: POD_IP
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: status.podIP
|
||||
- name: PROMETHEUS_MULTIPROC_DIR
|
||||
value: /tmp
|
||||
- name: OMP_NUM_THREADS
|
||||
value: '32'
|
||||
- name: HF_TOKEN
|
||||
value: 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
|
||||
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
|
||||
value: '1'
|
||||
- name: NCCL_TOPO_FILE
|
||||
value: /etc/nccl/virtualTopology.xml
|
||||
- name: PYTORCH_CUDA_ALLOC_CONF
|
||||
value: expandable_segments:True
|
||||
- name: LMCACHE_REMOTE_URL
|
||||
value: redis://10.66.0.100:6379
|
||||
- name: LMCACHE_REMOTE_SERDE
|
||||
value: naive
|
||||
- name: LMCACHE_USE_EXPERIMENTAL
|
||||
value: 'True'
|
||||
- name: VLLM_RPC_TIMEOUT
|
||||
value: '1000000'
|
||||
- name: LMCACHE_LOG_LEVEL
|
||||
value: ERROR
|
||||
- name: LMCACHE_LOCAL_CPU
|
||||
value: 'True'
|
||||
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
|
||||
value: '512'
|
||||
- name: LMCACHE_LMCACHE_INSTANCE_ID
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: metadata.name
|
||||
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||||
imagePullPolicy: Always
|
||||
livenessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
name: vllm
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: container-port
|
||||
protocol: TCP
|
||||
- containerPort: 55555
|
||||
name: zmq-port
|
||||
protocol: TCP
|
||||
- containerPort: 9999
|
||||
name: ucx-port
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
requests:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
securityContext:
|
||||
runAsNonRoot: false
|
||||
startupProbe:
|
||||
failureThreshold: 120
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 60
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: shm
|
||||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||||
name: kube-api-access-dlhrd
|
||||
readOnly: true
|
||||
dnsPolicy: ClusterFirst
|
||||
enableServiceLinks: true
|
||||
hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
|
||||
nodeName: b200-nodepool-d51376abbf32
|
||||
preemptionPolicy: PreemptLowerPriority
|
||||
priority: 0
|
||||
restartPolicy: Always
|
||||
schedulerName: default-scheduler
|
||||
securityContext: {}
|
||||
serviceAccount: default
|
||||
serviceAccountName: default
|
||||
subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
|
||||
terminationGracePeriodSeconds: 30
|
||||
tolerations:
|
||||
- effect: NoExecute
|
||||
key: node.kubernetes.io/not-ready
|
||||
operator: Exists
|
||||
tolerationSeconds: 300
|
||||
- effect: NoExecute
|
||||
key: node.kubernetes.io/unreachable
|
||||
operator: Exists
|
||||
tolerationSeconds: 300
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 64Gi
|
||||
name: shm
|
||||
- name: kube-api-access-dlhrd
|
||||
projected:
|
||||
defaultMode: 420
|
||||
sources:
|
||||
- serviceAccountToken:
|
||||
expirationSeconds: 3607
|
||||
path: token
|
||||
- configMap:
|
||||
items:
|
||||
- key: ca.crt
|
||||
path: ca.crt
|
||||
name: kube-root-ca.crt
|
||||
- downwardAPI:
|
||||
items:
|
||||
- fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: metadata.namespace
|
||||
path: namespace
|
||||
status:
|
||||
conditions:
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:32Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: PodReadyToStartContainers
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: Initialized
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
message: 'containers with unready status: [vllm]'
|
||||
observedGeneration: 1
|
||||
reason: ContainersNotReady
|
||||
status: 'False'
|
||||
type: Ready
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
message: 'containers with unready status: [vllm]'
|
||||
observedGeneration: 1
|
||||
reason: ContainersNotReady
|
||||
status: 'False'
|
||||
type: ContainersReady
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: PodScheduled
|
||||
containerStatuses:
|
||||
- allocatedResources:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
containerID: >-
|
||||
containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
|
||||
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||||
imageID: >-
|
||||
atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
|
||||
lastState:
|
||||
terminated:
|
||||
containerID: >-
|
||||
containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
|
||||
exitCode: 1
|
||||
finishedAt: '2026-04-15T22:42:20Z'
|
||||
reason: Error
|
||||
startedAt: '2026-04-15T22:38:31Z'
|
||||
name: vllm
|
||||
ready: false
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
requests:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
restartCount: 1
|
||||
started: false
|
||||
state:
|
||||
running:
|
||||
startedAt: '2026-04-15T22:42:24Z'
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: shm
|
||||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||||
name: kube-api-access-dlhrd
|
||||
readOnly: true
|
||||
recursiveReadOnly: Disabled
|
||||
hostIP: 10.4.96.13
|
||||
hostIPs:
|
||||
- ip: 10.4.96.13
|
||||
- ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
|
||||
observedGeneration: 1
|
||||
phase: Running
|
||||
podIP: 10.244.248.111
|
||||
podIPs:
|
||||
- ip: 10.244.248.111
|
||||
- ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
|
||||
qosClass: Burstable
|
||||
startTime: '2026-04-15T22:38:27Z'
|
||||
545
THIS_IS_THE_ERROR_VINNY.md
Normal file
545
THIS_IS_THE_ERROR_VINNY.md
Normal file
@@ -0,0 +1,545 @@
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299]
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] █ █ █▄ ▄█
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] █▄█▀ █ █ █ █ model nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299]
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:233] non-default args: {'model_tag': 'nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', 'enable_auto_tool_choice': True, 'tool_call_parser': 'qwen3_coder', 'host': '0.0.0.0', 'model': 'nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', 'trust_remote_code': True, 'max_model_len': 1048576, 'enforce_eager': True, 'attention_backend': 'TRITON_ATTN', 'reasoning_parser': 'super_v3', 'reasoning_parser_plugin': '/opt/super_v3_reasoning_parser.py', 'tensor_parallel_size': 8, 'disable_custom_all_reduce': True, 'gpu_memory_utilization': 0.96, 'enable_prefix_caching': False, 'mamba_ssm_cache_dtype': 'float16', 'enable_chunked_prefill': True, 'disable_hybrid_kv_cache_manager': False, 'async_scheduling': True, 'max_cudagraph_capture_size': 128, 'kv_transfer_config': KVTransferConfig(kv_connector='LMCacheConnectorV1', engine_id='dea40998-1518-4361-a31f-884d3c1c1e74', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='fail')}
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_ADDR
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_PORT
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_PROTO
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_PORT
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT_HTTP_MONITORING
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT_LISTENER_80
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_HOST
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_PROTO
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_ADDR
|
||||
(APIServer pid=1) A new version of the following files was downloaded from https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4:
|
||||
(APIServer pid=1) - configuration_nemotron_h.py
|
||||
(APIServer pid=1) . Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [model.py:549] Resolved architecture: NemotronHForCausalLM
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [model.py:2176] User-specified max_model_len (1048576) is greater than the derived max_model_len (max_position_embeddings=262144.0 or model_max_length=None in model's config.json). VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme caution. If the model uses relative position encoding (RoPE), positions exceeding derived_max_model_len lead to nan. If the model uses absolute position encoding, positions exceeding derived_max_model_len will cause a CUDA array out-of-bounds error.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [model.py:1678] Using max model len 1048576
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [cache.py:227] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [config.py:281] Setting attention block size to 1056 tokens to ensure that attention page size is >= mamba page size.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [config.py:312] Padding mamba page size by 0.19% to ensure that mamba page size and attention page size are exactly equal.
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [modelopt.py:381] Detected ModelOpt fp8 checkpoint (quant_algo=FP8). Please note that the format is experimental and could change.
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [modelopt.py:998] Detected ModelOpt NVFP4 checkpoint. Please note that the format is experimental and could change in future.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [vllm.py:790] Asynchronous scheduling is enabled.
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [vllm.py:1025] Cudagraph is disabled under eager mode
|
||||
(APIServer pid=1) INFO 04-15 22:38:51 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant, allreduce_rms
|
||||
(EngineCore pid=277) INFO 04-15 22:38:58 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', speculative_config=None, tokenizer='nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=1048576, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=True, quantization=modelopt_mixed, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=fp8_e4m3, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='super_v3', reasoning_parser_plugin='/opt/super_v3_reasoning_parser.py', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.NONE: 0>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 
'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.NONE: 0>, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': True, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
|
||||
(EngineCore pid=277) INFO 04-15 22:38:58 [multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.244.248.111 (local), world_size=8, local_world_size=8
|
||||
(Worker pid=348) INFO 04-15 22:39:03 [parallel_state.py:1400] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=415) INFO 04-15 22:39:07 [parallel_state.py:1400] world_size=8 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=483) INFO 04-15 22:39:11 [parallel_state.py:1400] world_size=8 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=556) INFO 04-15 22:39:15 [parallel_state.py:1400] world_size=8 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=629) INFO 04-15 22:39:19 [parallel_state.py:1400] world_size=8 rank=4 local_rank=4 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=702) INFO 04-15 22:39:23 [parallel_state.py:1400] world_size=8 rank=5 local_rank=5 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=775) INFO 04-15 22:39:27 [parallel_state.py:1400] world_size=8 rank=6 local_rank=6 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=848) INFO 04-15 22:39:31 [parallel_state.py:1400] world_size=8 rank=7 local_rank=7 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=348) INFO 04-15 22:39:31 [pynccl.py:111] vLLM is using nccl==2.28.9
|
||||
(Worker pid=348) INFO 04-15 22:39:36 [parallel_state.py:1716] rank 0 in world size 8 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:37 [gpu_model_runner.py:4735] Starting to load model nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4...
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [__init__.py:261] Selected FlashInferFP8ScaledMMLinearKernel for ModelOptFp8LinearMethod
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform.
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [nvfp4_utils.py:85] Using NvFp4LinearBackend.FLASHINFER_CUTLASS for NVFP4 GEMM
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [nvfp4.py:256] Using 'FLASHINFER_TRTLLM' NvFp4 MoE backend out of potential backends: ['FLASHINFER_TRTLLM', 'FLASHINFER_CUTEDSL', 'FLASHINFER_CUTLASS', 'VLLM_CUTLASS', 'MARLIN'].
|
||||
(Worker_TP1 pid=415) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP4 pid=629) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP2 pid=483) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP6 pid=775) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP7 pid=848) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP3 pid=556) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP5 pid=702) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP1 pid=415) INFO 04-15 22:41:36 [weight_utils.py:581] Time spent downloading weights for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4: 116.048954 seconds
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 0% Completed | 0/17 [00:00<?, ?it/s]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 6% Completed | 1/17 [00:01<00:20, 1.29s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 12% Completed | 2/17 [00:03<00:23, 1.60s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 18% Completed | 3/17 [00:04<00:21, 1.52s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 24% Completed | 4/17 [00:06<00:20, 1.58s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 29% Completed | 5/17 [00:07<00:17, 1.49s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 35% Completed | 6/17 [00:08<00:16, 1.47s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 41% Completed | 7/17 [00:10<00:14, 1.42s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 47% Completed | 8/17 [00:11<00:12, 1.39s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 53% Completed | 9/17 [00:13<00:11, 1.40s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 59% Completed | 10/17 [00:14<00:10, 1.47s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 65% Completed | 11/17 [00:16<00:08, 1.45s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 71% Completed | 12/17 [00:17<00:07, 1.44s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 76% Completed | 13/17 [00:18<00:05, 1.43s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 82% Completed | 14/17 [00:20<00:04, 1.43s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 88% Completed | 15/17 [00:21<00:02, 1.30s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 100% Completed | 17/17 [00:21<00:00, 1.26s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:41:59 [default_loader.py:384] Loading weights took 21.38 seconds
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:41:59 [flashinfer_utils.py:238] Padding intermediate size from 336 to 384 for up/down projection weights.
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:41:59 [nvfp4.py:401] Using MoEPrepareAndFinalizeNoDPEPMonolithic
|
||||
(Worker_TP0 pid=348) WARNING 04-15 22:41:59 [kv_cache.py:94] Checkpoint does not provide a q scaling factor. Setting it to k_scale. This only matters for FP8 Attention backends (flash-attn or flashinfer).
|
||||
(Worker_TP0 pid=348) WARNING 04-15 22:41:59 [kv_cache.py:108] Using KV cache scaling factor 1.0 for fp8_e4m3. If this is unintended, verify that k/v_scale scaling factors are properly set in the checkpoint.
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:42:01 [gpu_model_runner.py:4820] Model loading took 10.4 GiB memory and 142.225157 seconds
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:42:10 [gpu_worker.py:436] Available KV cache memory: 158.16 GiB
|
||||
(EngineCore pid=277) INFO 04-15 22:42:11 [kv_cache_utils.py:1319] GPU KV cache size: 13,819,872 tokens
|
||||
(EngineCore pid=277) INFO 04-15 22:42:11 [kv_cache_utils.py:1324] Maximum concurrency for 1,048,576 tokens per request: 78.68x
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] EngineCore failed to start.
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] Traceback (most recent call last):
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] return func(*args, **kwargs)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] super().__init__(
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 124, in __init__
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] kv_cache_config = self._initialize_kv_caches(vllm_config)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] return func(*args, **kwargs)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 280, in _initialize_kv_caches
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] self.model_executor.initialize_from_config(kv_cache_configs)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 117, in initialize_from_config
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 397, in collective_rpc
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] return aggregate(get_response())
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 380, in get_response
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] raise RuntimeError(
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] RuntimeError: Worker failed with error 'Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.', please check the stack trace above for the root cause
|
||||
(Worker_TP3 pid=556) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP0 pid=348) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP6 pid=775) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP7 pid=848) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP4 pid=629) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP2 pid=483) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP1 pid=415) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP5 pid=702) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:14 [multiproc_executor.py:273] Worker proc VllmWorker-3 died unexpectedly, shutting down executor.
|
||||
(EngineCore pid=277) Process EngineCore:
|
||||
(EngineCore pid=277) Traceback (most recent call last):
|
||||
(EngineCore pid=277) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
|
||||
(EngineCore pid=277) self.run()
|
||||
(EngineCore pid=277) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
|
||||
(EngineCore pid=277) self._target(*self._args, **self._kwargs)
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core
|
||||
(EngineCore pid=277) raise e
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core
|
||||
(EngineCore pid=277) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=277) return func(*args, **kwargs)
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__
|
||||
(EngineCore pid=277) super().__init__(
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 124, in __init__
|
||||
(EngineCore pid=277) kv_cache_config = self._initialize_kv_caches(vllm_config)
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=277) return func(*args, **kwargs)
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 280, in _initialize_kv_caches
|
||||
(EngineCore pid=277) self.model_executor.initialize_from_config(kv_cache_configs)
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 117, in initialize_from_config
|
||||
(EngineCore pid=277) self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 397, in collective_rpc
|
||||
(EngineCore pid=277) return aggregate(get_response())
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 380, in get_response
|
||||
(EngineCore pid=277) raise RuntimeError(
|
||||
(EngineCore pid=277) RuntimeError: Worker failed with error 'Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.', please check the stack trace above for the root cause
|
||||
(APIServer pid=1) Traceback (most recent call last):
|
||||
(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in <module>
|
||||
(APIServer pid=1) sys.exit(main())
|
||||
(APIServer pid=1) ^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main
|
||||
(APIServer pid=1) args.dispatch_function(args)
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd
|
||||
(APIServer pid=1) uvloop.run(run_server(args))
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
|
||||
(APIServer pid=1) return __asyncio.run(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
|
||||
(APIServer pid=1) return runner.run(main)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
|
||||
(APIServer pid=1) return self._loop.run_until_complete(task)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
|
||||
(APIServer pid=1) return await main
|
||||
(APIServer pid=1) ^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server
|
||||
(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker
|
||||
(APIServer pid=1) async with build_async_engine_client(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
|
||||
(APIServer pid=1) return await anext(self.gen)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client
|
||||
(APIServer pid=1) async with build_async_engine_client_from_engine_args(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
|
||||
(APIServer pid=1) return await anext(self.gen)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args
|
||||
(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config
|
||||
(APIServer pid=1) return cls(
|
||||
(APIServer pid=1) ^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__
|
||||
(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(APIServer pid=1) return func(*args, **kwargs)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client
|
||||
(APIServer pid=1) return AsyncMPClient(*client_args)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(APIServer pid=1) return func(*args, **kwargs)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__
|
||||
(APIServer pid=1) super().__init__(
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__
|
||||
(APIServer pid=1) with launch_core_engines(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
|
||||
(APIServer pid=1) next(self.gen)
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines
|
||||
(APIServer pid=1) wait_for_engine_startup(
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup
|
||||
(APIServer pid=1) raise RuntimeError(
|
||||
(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
|
||||
/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 8 leaked shared_memory objects to clean up at shutdown
|
||||
warnings.warn('resource_tracker: There appear to be %d '
|
||||
379
lmcache_connector.py
Normal file
379
lmcache_connector.py
Normal file
@@ -0,0 +1,379 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Iterable
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_events import (
|
||||
BlockStored,
|
||||
KVCacheEvent,
|
||||
KVConnectorKVEvents,
|
||||
KVEventAggregator,
|
||||
)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
KVConnectorBase_V1,
|
||||
KVConnectorMetadata,
|
||||
KVConnectorRole,
|
||||
SupportsHMA,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.attention.backend import AttentionMetadata
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.outputs import KVConnectorOutput
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.request import Request
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class LMCacheKVEvents(KVConnectorKVEvents):
    """KVConnectorKVEvents implementation backed by a ``KVEventAggregator``.

    All storage and worker bookkeeping is delegated to the aggregator;
    this class only adapts it to the KVConnectorKVEvents interface.
    """

    def __init__(self, num_workers: int) -> None:
        # The aggregator tracks per-worker contributions so that
        # aggregation can keep only events reported by every worker.
        self._aggregator = KVEventAggregator(num_workers)

    def add_events(self, events: list[KVCacheEvent]) -> None:
        """Record a batch of KV cache events."""
        self._aggregator.add_events(events)

    def aggregate(self) -> "LMCacheKVEvents":
        """Collapse the stored events down to those common to all workers.

        The aggregator is cleared, re-seeded with only the common events,
        and its worker bookkeeping reset. Returns ``self`` for chaining.
        """
        shared = self._aggregator.get_common_events()
        self._aggregator.clear_events()
        self._aggregator.add_events(shared)
        self._aggregator.reset_workers()
        return self

    def increment_workers(self, count: int = 1) -> None:
        """Increase the tracked worker count by ``count``."""
        self._aggregator.increment_workers(count)

    def get_all_events(self) -> list[KVCacheEvent]:
        """Return every event currently held by the aggregator."""
        return self._aggregator.get_all_events()

    def get_number_of_workers(self) -> int:
        """Return the number of workers currently tracked."""
        return self._aggregator.get_number_of_workers()

    def clear_events(self) -> None:
        """Drop all stored events and reset the worker bookkeeping."""
        self._aggregator.clear_events()
        self._aggregator.reset_workers()

    def __repr__(self) -> str:
        return f"<LMCacheKVEvents events={self.get_all_events()}>"
|
||||
|
||||
|
||||
class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA):
    """vLLM v1 KV connector backed by an LMCache engine.

    Nearly every method is a thin delegation to the underlying
    ``LMCacheConnectorV1Impl``. Declaring ``SupportsHMA`` lets this
    connector be used with the hybrid (Mamba/Attention) KV cache manager.
    """

    @classmethod
    def requires_piecewise_for_cudagraph(cls, extra_config: dict[str, Any]) -> bool:
        """Return whether PIECEWISE CUDA graph mode is required.

        With layerwise operation enabled, wait_for_layer_load and
        save_kv_layer perform real async synchronization that cannot be
        captured inside a CUDA graph, so piecewise capture is needed.
        """
        return extra_config.get("use_layerwise", False)

    def __init__(
        self,
        vllm_config: "VllmConfig",
        role: KVConnectorRole,
        kv_cache_config: "KVCacheConfig",
    ):
        super().__init__(
            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
        )
        assert vllm_config.kv_transfer_config is not None

        # Choose the engine implementation: the copy vendored inside vLLM
        # ("native") or the one shipped with the installed lmcache package.
        use_native = vllm_config.kv_transfer_config.get_from_extra_config(
            "use_native", False
        )
        if use_native:
            logger.info("Initializing native LMCache connector")
            # lazy import to avoid a hard dependency at module load time
            from vllm.distributed.kv_transfer.kv_connector.v1 import lmcache_integration

            impl_cls = lmcache_integration.vllm_v1_adapter.LMCacheConnectorV1Impl
        else:
            logger.info("Initializing latest dev LMCache connector")
            # lazy import to avoid a hard dependency at module load time
            from lmcache.integration.vllm.vllm_v1_adapter import (
                LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
            )

            impl_cls = LMCacheConnectorLatestImpl

        self._lmcache_engine = impl_cls(vllm_config, role, self)

        # Scheduler-side accumulator of worker KV events, merged in
        # update_connector_output and drained by take_events.
        self._kv_cache_events: LMCacheKVEvents | None = None

    # ==============================
    # Worker-side methods
    # ==============================
    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
        """Hand the paged KV cache tensors to the engine up front.

        Args:
            kv_caches: mapping of layer name -> KV cache tensor.
        """
        register = getattr(self._lmcache_engine, "register_kv_caches", None)
        if register is None:
            # Older engine versions lack this hook; warn rather than fail.
            logger.warning(
                "LMCache engine does not support register_kv_caches, "
                "please check and use the latest version"
            )
        else:
            register(kv_caches)

    def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
        """Kick off async loading of KV cache into vLLM's paged buffer.

        Invoked from the forward context before the forward pass so the
        load can overlap with model execution.

        Args:
            forward_context: the forward context.
            **kwargs: extra arguments forwarded to the engine.
        """
        self._lmcache_engine.start_load_kv(forward_context, **kwargs)

    def wait_for_layer_load(self, layer_name: str) -> None:
        """Block until one layer's KV has landed in the paged buffer.

        Called from within the attention layer; enables layer-by-layer
        pipelining of loads started by start_load_kv.

        Args:
            layer_name: name of the layer to wait on.
        """
        self._lmcache_engine.wait_for_layer_load(layer_name)

    def save_kv_layer(
        self,
        layer_name: str,
        kv_layer: torch.Tensor,
        attn_metadata: AttentionMetadata,
        **kwargs: Any,
    ) -> None:
        """Begin async saving of one layer's KV from the paged buffer.

        Called from within the attention layer so the copy overlaps with
        execution of subsequent layers.

        Args:
            layer_name: name of the layer being saved.
            kv_layer: that layer's paged KV buffer in vLLM.
            attn_metadata: the attention metadata.
            **kwargs: extra arguments forwarded to the engine.
        """
        self._lmcache_engine.save_kv_layer(
            layer_name, kv_layer, attn_metadata, **kwargs
        )

    def wait_for_save(self):
        """Block until every outstanding save has completed.

        Called as the forward context exits, preventing the paged KV
        buffer from being overwritten before saving finishes.
        """
        self._lmcache_engine.wait_for_save()

    def get_finished(
        self, finished_req_ids: set[str]
    ) -> tuple[set[str] | None, set[str] | None]:
        """Report request ids whose async transfers have completed.

        Args:
            finished_req_ids: ids of requests that finished generating.

        Returns:
            Tuple of (finished sending/saving ids, finished
            recving/loading ids); ids must have been passed to this
            method (now or previously) and have returned True from
            request_finished().
        """
        return self._lmcache_engine.get_finished(finished_req_ids)

    def get_block_ids_with_load_errors(self) -> set[int]:
        """Return block IDs that failed to load (empty set if none).

        Older engine versions lack this hook, in which case an empty
        set is returned.
        """
        if callable(getattr(self._lmcache_engine, "get_block_ids_with_load_errors", None)):
            return self._lmcache_engine.get_block_ids_with_load_errors()
        return set()

    def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None:
        """Return KV cache events gathered since the last call, if any."""
        raw_events = self._lmcache_engine.get_kv_events()  # type: ignore [attr-defined]
        if not raw_events:
            return None

        # Translate engine-level events into vLLM BlockStored events.
        stored: list[BlockStored] = []
        for ev in raw_events:
            stored.append(
                BlockStored(
                    block_hashes=ev.block_hashes,
                    parent_block_hash=ev.parent_block_hash,
                    token_ids=ev.token_ids,
                    lora_id=ev.lora_id,
                    block_size=ev.block_size,
                    medium=ev.medium,
                    # lora_name is absent on older engine event types
                    lora_name=getattr(ev, "lora_name", None),
                )
            )

        result = LMCacheKVEvents(num_workers=1)
        result.add_events(stored)
        return result

    # ==============================
    # Scheduler-side methods
    # ==============================
    def get_num_new_matched_tokens(
        self,
        request: "Request",
        num_computed_tokens: int,
    ) -> tuple[int | None, bool]:
        """Query how many extra tokens the external cache can supply.

        Args:
            request: the request object.
            num_computed_tokens: tokens already computed locally.

        Returns:
            (number of additional loadable tokens, False) — the second
            element indicates the load is not asynchronous.
        """
        matched = self._lmcache_engine.get_num_new_matched_tokens(
            request, num_computed_tokens
        )
        return matched, False

    def update_state_after_alloc(
        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
    ):
        """Update connector state after KV blocks were allocated."""
        self._lmcache_engine.update_state_after_alloc(request, num_external_tokens)

    def build_connector_meta(
        self, scheduler_output: SchedulerOutput
    ) -> KVConnectorMetadata:
        """Build this step's connector metadata (resets connector state).

        Must not mutate fields of ``scheduler_output``.

        Args:
            scheduler_output: the scheduler output object.
        """
        return self._lmcache_engine.build_connector_meta(scheduler_output)

    def update_connector_output(self, connector_output: KVConnectorOutput):
        """Merge worker-side KV events into the scheduler-side accumulator.

        Args:
            connector_output: the worker-side connectors output.
        """
        incoming = connector_output.kv_cache_events
        # Ignore absent output or events of an unexpected type.
        if not incoming or not isinstance(incoming, LMCacheKVEvents):
            return

        if self._kv_cache_events is None:
            self._kv_cache_events = incoming
        else:
            self._kv_cache_events.add_events(incoming.get_all_events())
            self._kv_cache_events.increment_workers(
                incoming.get_number_of_workers()
            )

    def request_finished(
        self,
        request: "Request",
        block_ids: list[int],
    ) -> tuple[bool, dict[str, Any] | None]:
        """Notify the engine that a request finished, before blocks free.

        Returns:
            (True if blocks must be kept until get_finished() reports the
            request id, optional KVTransferParams for the request output).
        """
        return self._lmcache_engine.request_finished(request, block_ids)

    def request_finished_all_groups(
        self,
        request: "Request",
        block_ids: tuple[list[int], ...],
    ) -> tuple[bool, dict[str, Any] | None]:
        """HMA variant of request_finished for all KV cache groups.

        LMCache only stores/offloads attention KV blocks, so only the
        first group's block IDs are forwarded; Mamba SSM state is managed
        separately by the scheduler.

        Args:
            request: the request object.
            block_ids: one block ID list per KV cache group.

        Returns:
            Same as request_finished.
        """
        return self.request_finished(request, block_ids[0])

    def take_events(self) -> Iterable["KVCacheEvent"]:
        """Yield and drain the KV cache events accumulated since last call."""
        if self._kv_cache_events is None:
            return
        # Reduce to events agreed on by all workers, then hand them out.
        self._kv_cache_events.aggregate()
        pending = self._kv_cache_events.get_all_events()
        yield from pending
        self._kv_cache_events.clear_events()
        self._kv_cache_events = None
|
||||
Reference in New Issue
Block a user