monkey patch the monkey patching vllm nonsense

This commit is contained in:
2026-04-15 22:55:00 +00:00
parent f38ffcf115
commit c570c4658e
4 changed files with 1268 additions and 1 deletions

View File

@@ -20,4 +20,10 @@ RUN apt-get update && apt-get install -y git \
COPY ./super_v3_reasoning_parser.py /opt/super_v3_reasoning_parser.py
# Monkey patch more vllm stuff - https://github.com/vllm-project/vllm/pull/38237/changes#diff-bee6813076031d3ca1edc903c1b02b81e4676519afc562ce3fefe37f20c7b650
RUN sed -i "s/if self\.kv_events_config is not None:/if self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events:/" /usr/local/lib/python3.12/dist-packages/vllm/config/vllm.py
# Patch LMCacheConnectorV1 to support HMA (Hybrid Mamba/Attention KV cache manager)
# This is required for hybrid models like Nemotron that use both Mamba and Attention layers.
# Without this patch, LMCacheConnectorV1 fails with:
# "Connector LMCacheConnectorV1 does not support HMA but HMA is enabled"
COPY ./lmcache_connector.py /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py

View File

@@ -0,0 +1,337 @@
apiVersion: v1
kind: Pod
metadata:
annotations:
cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
cni.projectcalico.org/podIP: 10.244.248.111/32
cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
k8s.v1.cni.cncf.io/network-status: |-
[{
"name": "k8s-pod-network",
"ips": [
"10.244.248.111",
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
],
"default": true,
"dns": {}
},{
"name": "vllm/ipoib-network-vllm",
"interface": "net1",
"ips": [
"10.66.0.6"
],
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
"dns": {}
}]
k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
k8s.v1.cni.cncf.io/networks-status: |-
[{
"name": "k8s-pod-network",
"ips": [
"10.244.248.111",
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
],
"default": true,
"dns": {}
},{
"name": "vllm/ipoib-network-vllm",
"interface": "net1",
"ips": [
"10.66.0.6"
],
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
"dns": {}
}]
creationTimestamp: '2026-04-15T22:38:27Z'
generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
generation: 1
labels:
app.kubernetes.io/component: serving-engine
app.kubernetes.io/instance: production-stack-sea-inference
app.kubernetes.io/managed-by: helm
app.kubernetes.io/name: nemotron-3-super
app.kubernetes.io/part-of: vllm-stack
environment: test
helm-release-name: production-stack-sea-inference
model: nemotron-3-super
pod-template-hash: 856dc7d695
release: test
topology.kubernetes.io/region: sea
name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
namespace: vllm
ownerReferences:
- apiVersion: apps/v1
blockOwnerDeletion: true
controller: true
kind: ReplicaSet
name: >-
production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
uid: 88c04723-f29b-432a-8318-21a9d389cac4
resourceVersion: '29767269'
uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
spec:
containers:
- command:
- vllm
- serve
- nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
- '--host'
- 0.0.0.0
- '--port'
- '8000'
- '--no-enable-prefix-caching'
- '--tensor-parallel-size'
- '8'
- '--async-scheduling'
- '--dtype=auto'
- '--attention-backend=TRITON_ATTN'
- '--gpu_memory_utilization=0.96'
- '--enable-auto-tool-choice'
- '--tool-call-parser=qwen3_coder'
- '--trust_remote_code'
- '--max-cudagraph-capture-size=128'
- '--enable-chunked-prefill'
- '--mamba-ssm-cache-dtype=float16'
- '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
- '--reasoning-parser=super_v3'
- '--max-model-len=1048576'
- '--disable-custom-all-reduce'
- '--no-disable-hybrid-kv-cache-manager' # NOTE: keeps the hybrid (HMA) KV cache manager ON; LMCacheConnectorV1 aborts at startup with "does not support HMA" unless the patched lmcache_connector.py from the image is in place
- '--enforce-eager'
- '--kv-transfer-config'
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
env:
- name: PYTHONHASHSEED
value: '123'
- name: HF_HOME
value: /tmp
- name: POD_IP
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.podIP
- name: PROMETHEUS_MULTIPROC_DIR
value: /tmp
- name: OMP_NUM_THREADS
value: '32'
- name: HF_TOKEN
value: REDACTED_HF_TOKEN # SECURITY: a real token was committed here in plain text — rotate it immediately and inject via a Kubernetes Secret (secretKeyRef), not a literal env value
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
value: '1'
- name: NCCL_TOPO_FILE
value: /etc/nccl/virtualTopology.xml
- name: PYTORCH_CUDA_ALLOC_CONF
value: expandable_segments:True
- name: LMCACHE_REMOTE_URL
value: redis://10.66.0.100:6379
- name: LMCACHE_REMOTE_SERDE
value: naive
- name: LMCACHE_USE_EXPERIMENTAL
value: 'True'
- name: VLLM_RPC_TIMEOUT
value: '1000000'
- name: LMCACHE_LOG_LEVEL
value: ERROR
- name: LMCACHE_LOCAL_CPU
value: 'True'
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
value: '512'
- name: LMCACHE_LMCACHE_INSTANCE_ID
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.name
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
imagePullPolicy: Always
livenessProbe:
failureThreshold: 3
httpGet:
path: /health
port: 8000
scheme: HTTP
initialDelaySeconds: 15
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
name: vllm
ports:
- containerPort: 8000
name: container-port
protocol: TCP
- containerPort: 55555
name: zmq-port
protocol: TCP
- containerPort: 9999
name: ucx-port
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /health
port: 8000
scheme: HTTP
initialDelaySeconds: 15
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
resources:
limits:
memory: 1500Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
requests:
cpu: '8'
memory: 16Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
securityContext:
runAsNonRoot: false
startupProbe:
failureThreshold: 120
httpGet:
path: /health
port: 8000
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 60
successThreshold: 1
timeoutSeconds: 1
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /dev/shm
name: shm
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-dlhrd
readOnly: true
dnsPolicy: ClusterFirst
enableServiceLinks: true
hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
nodeName: b200-nodepool-d51376abbf32
preemptionPolicy: PreemptLowerPriority
priority: 0
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
serviceAccount: default
serviceAccountName: default
subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- emptyDir:
medium: Memory
sizeLimit: 64Gi
name: shm
- name: kube-api-access-dlhrd
projected:
defaultMode: 420
sources:
- serviceAccountToken:
expirationSeconds: 3607
path: token
- configMap:
items:
- key: ca.crt
path: ca.crt
name: kube-root-ca.crt
- downwardAPI:
items:
- fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
path: namespace
status:
conditions:
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:32Z'
observedGeneration: 1
status: 'True'
type: PodReadyToStartContainers
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:27Z'
observedGeneration: 1
status: 'True'
type: Initialized
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:27Z'
message: 'containers with unready status: [vllm]'
observedGeneration: 1
reason: ContainersNotReady
status: 'False'
type: Ready
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:27Z'
message: 'containers with unready status: [vllm]'
observedGeneration: 1
reason: ContainersNotReady
status: 'False'
type: ContainersReady
- lastProbeTime: null
lastTransitionTime: '2026-04-15T22:38:27Z'
observedGeneration: 1
status: 'True'
type: PodScheduled
containerStatuses:
- allocatedResources:
cpu: '8'
memory: 16Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
containerID: >-
containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
imageID: >-
atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
lastState:
terminated:
containerID: >-
containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
exitCode: 1
finishedAt: '2026-04-15T22:42:20Z'
reason: Error
startedAt: '2026-04-15T22:38:31Z'
name: vllm
ready: false
resources:
limits:
memory: 1500Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
requests:
cpu: '8'
memory: 16Gi
nvidia.com/gpu: '8'
rdma/ib: '1'
restartCount: 1
started: false
state:
running:
startedAt: '2026-04-15T22:42:24Z'
volumeMounts:
- mountPath: /dev/shm
name: shm
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-dlhrd
readOnly: true
recursiveReadOnly: Disabled
hostIP: 10.4.96.13
hostIPs:
- ip: 10.4.96.13
- ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
observedGeneration: 1
phase: Running
podIP: 10.244.248.111
podIPs:
- ip: 10.244.248.111
- ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
qosClass: Burstable
startTime: '2026-04-15T22:38:27Z'

545
THIS_IS_THE_ERROR_VINNY.md Normal file
View File

@@ -0,0 +1,545 @@
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299]
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] █ █ █▄ ▄█
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] █▄█▀ █ █ █ █ model nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299]
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:233] non-default args: {'model_tag': 'nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', 'enable_auto_tool_choice': True, 'tool_call_parser': 'qwen3_coder', 'host': '0.0.0.0', 'model': 'nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', 'trust_remote_code': True, 'max_model_len': 1048576, 'enforce_eager': True, 'attention_backend': 'TRITON_ATTN', 'reasoning_parser': 'super_v3', 'reasoning_parser_plugin': '/opt/super_v3_reasoning_parser.py', 'tensor_parallel_size': 8, 'disable_custom_all_reduce': True, 'gpu_memory_utilization': 0.96, 'enable_prefix_caching': False, 'mamba_ssm_cache_dtype': 'float16', 'enable_chunked_prefill': True, 'disable_hybrid_kv_cache_manager': False, 'async_scheduling': True, 'max_cudagraph_capture_size': 128, 'kv_transfer_config': KVTransferConfig(kv_connector='LMCacheConnectorV1', engine_id='dea40998-1518-4361-a31f-884d3c1c1e74', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='fail')}
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_ADDR
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_PORT
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_PROTO
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_PORT
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT_HTTP_MONITORING
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT_LISTENER_80
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_HOST
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_PROTO
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_ADDR
(APIServer pid=1) A new version of the following files was downloaded from https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4:
(APIServer pid=1) - configuration_nemotron_h.py
(APIServer pid=1) . Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
(APIServer pid=1) INFO 04-15 22:38:46 [model.py:549] Resolved architecture: NemotronHForCausalLM
(APIServer pid=1) WARNING 04-15 22:38:46 [model.py:2176] User-specified max_model_len (1048576) is greater than the derived max_model_len (max_position_embeddings=262144.0 or model_max_length=None in model's config.json). VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme caution. If the model uses relative position encoding (RoPE), positions exceeding derived_max_model_len lead to nan. If the model uses absolute position encoding, positions exceeding derived_max_model_len will cause a CUDA array out-of-bounds error.
(APIServer pid=1) INFO 04-15 22:38:46 [model.py:1678] Using max model len 1048576
(APIServer pid=1) INFO 04-15 22:38:46 [cache.py:227] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor.
(APIServer pid=1) INFO 04-15 22:38:46 [scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192.
(APIServer pid=1) INFO 04-15 22:38:46 [config.py:281] Setting attention block size to 1056 tokens to ensure that attention page size is >= mamba page size.
(APIServer pid=1) INFO 04-15 22:38:46 [config.py:312] Padding mamba page size by 0.19% to ensure that mamba page size and attention page size are exactly equal.
(APIServer pid=1) WARNING 04-15 22:38:46 [modelopt.py:381] Detected ModelOpt fp8 checkpoint (quant_algo=FP8). Please note that the format is experimental and could change.
(APIServer pid=1) WARNING 04-15 22:38:46 [modelopt.py:998] Detected ModelOpt NVFP4 checkpoint. Please note that the format is experimental and could change in future.
(APIServer pid=1) INFO 04-15 22:38:46 [vllm.py:790] Asynchronous scheduling is enabled.
(APIServer pid=1) WARNING 04-15 22:38:46 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none
(APIServer pid=1) WARNING 04-15 22:38:46 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.
(APIServer pid=1) INFO 04-15 22:38:46 [vllm.py:1025] Cudagraph is disabled under eager mode
(APIServer pid=1) INFO 04-15 22:38:51 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant, allreduce_rms
(EngineCore pid=277) INFO 04-15 22:38:58 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', speculative_config=None, tokenizer='nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=1048576, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=True, quantization=modelopt_mixed, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=fp8_e4m3, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='super_v3', reasoning_parser_plugin='/opt/super_v3_reasoning_parser.py', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.NONE: 0>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 
'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.NONE: 0>, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': True, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore pid=277) INFO 04-15 22:38:58 [multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.244.248.111 (local), world_size=8, local_world_size=8
(Worker pid=348) INFO 04-15 22:39:03 [parallel_state.py:1400] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
(Worker pid=415) INFO 04-15 22:39:07 [parallel_state.py:1400] world_size=8 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
(Worker pid=483) INFO 04-15 22:39:11 [parallel_state.py:1400] world_size=8 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
(Worker pid=556) INFO 04-15 22:39:15 [parallel_state.py:1400] world_size=8 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
(Worker pid=629) INFO 04-15 22:39:19 [parallel_state.py:1400] world_size=8 rank=4 local_rank=4 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
(Worker pid=702) INFO 04-15 22:39:23 [parallel_state.py:1400] world_size=8 rank=5 local_rank=5 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
(Worker pid=775) INFO 04-15 22:39:27 [parallel_state.py:1400] world_size=8 rank=6 local_rank=6 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
(Worker pid=848) INFO 04-15 22:39:31 [parallel_state.py:1400] world_size=8 rank=7 local_rank=7 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
(Worker pid=348) INFO 04-15 22:39:31 [pynccl.py:111] vLLM is using nccl==2.28.9
(Worker pid=348) INFO 04-15 22:39:36 [parallel_state.py:1716] rank 0 in world size 8 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A
(Worker_TP0 pid=348) INFO 04-15 22:39:37 [gpu_model_runner.py:4735] Starting to load model nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4...
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [__init__.py:261] Selected FlashInferFP8ScaledMMLinearKernel for ModelOptFp8LinearMethod
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform.
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [nvfp4_utils.py:85] Using NvFp4LinearBackend.FLASHINFER_CUTLASS for NVFP4 GEMM
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [nvfp4.py:256] Using 'FLASHINFER_TRTLLM' NvFp4 MoE backend out of potential backends: ['FLASHINFER_TRTLLM', 'FLASHINFER_CUTEDSL', 'FLASHINFER_CUTLASS', 'VLLM_CUTLASS', 'MARLIN'].
(Worker_TP1 pid=415) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
(Worker_TP4 pid=629) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
(Worker_TP2 pid=483) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
(Worker_TP6 pid=775) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
(Worker_TP7 pid=848) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
(Worker_TP3 pid=556) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
(Worker_TP5 pid=702) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
(Worker_TP1 pid=415) INFO 04-15 22:41:36 [weight_utils.py:581] Time spent downloading weights for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4: 116.048954 seconds
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 0% Completed | 0/17 [00:00<?, ?it/s]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 6% Completed | 1/17 [00:01<00:20, 1.29s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 12% Completed | 2/17 [00:03<00:23, 1.60s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 18% Completed | 3/17 [00:04<00:21, 1.52s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 24% Completed | 4/17 [00:06<00:20, 1.58s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 29% Completed | 5/17 [00:07<00:17, 1.49s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 35% Completed | 6/17 [00:08<00:16, 1.47s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 41% Completed | 7/17 [00:10<00:14, 1.42s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 47% Completed | 8/17 [00:11<00:12, 1.39s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 53% Completed | 9/17 [00:13<00:11, 1.40s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 59% Completed | 10/17 [00:14<00:10, 1.47s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 65% Completed | 11/17 [00:16<00:08, 1.45s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 71% Completed | 12/17 [00:17<00:07, 1.44s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 76% Completed | 13/17 [00:18<00:05, 1.43s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 82% Completed | 14/17 [00:20<00:04, 1.43s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 88% Completed | 15/17 [00:21<00:02, 1.30s/it]
(Worker_TP0 pid=348)
Loading safetensors checkpoint shards: 100% Completed | 17/17 [00:21<00:00, 1.26s/it]
(Worker_TP0 pid=348)
(Worker_TP0 pid=348) INFO 04-15 22:41:59 [default_loader.py:384] Loading weights took 21.38 seconds
(Worker_TP0 pid=348) INFO 04-15 22:41:59 [flashinfer_utils.py:238] Padding intermediate size from 336 to 384 for up/down projection weights.
(Worker_TP0 pid=348) INFO 04-15 22:41:59 [nvfp4.py:401] Using MoEPrepareAndFinalizeNoDPEPMonolithic
(Worker_TP0 pid=348) WARNING 04-15 22:41:59 [kv_cache.py:94] Checkpoint does not provide a q scaling factor. Setting it to k_scale. This only matters for FP8 Attention backends (flash-attn or flashinfer).
(Worker_TP0 pid=348) WARNING 04-15 22:41:59 [kv_cache.py:108] Using KV cache scaling factor 1.0 for fp8_e4m3. If this is unintended, verify that k/v_scale scaling factors are properly set in the checkpoint.
(Worker_TP0 pid=348) INFO 04-15 22:42:01 [gpu_model_runner.py:4820] Model loading took 10.4 GiB memory and 142.225157 seconds
(Worker_TP0 pid=348) INFO 04-15 22:42:10 [gpu_worker.py:436] Available KV cache memory: 158.16 GiB
(EngineCore pid=277) INFO 04-15 22:42:11 [kv_cache_utils.py:1319] GPU KV cache size: 13,819,872 tokens
(EngineCore pid=277) INFO 04-15 22:42:11 [kv_cache_utils.py:1324] Maximum concurrency for 1,048,576 tokens per request: 78.68x
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] EngineCore failed to start.
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] Traceback (most recent call last):
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] return func(*args, **kwargs)
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] super().__init__(
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 124, in __init__
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] kv_cache_config = self._initialize_kv_caches(vllm_config)
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] return func(*args, **kwargs)
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 280, in _initialize_kv_caches
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 117, in initialize_from_config
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 397, in collective_rpc
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] return aggregate(get_response())
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 380, in get_response
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] raise RuntimeError(
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] RuntimeError: Worker failed with error 'Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.', please check the stack trace above for the root cause
(Worker_TP3 pid=556) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP0 pid=348) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP6 pid=775) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP7 pid=848) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP4 pid=629) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP2 pid=483) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP1 pid=415) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP5 pid=702) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
(EngineCore pid=277) ERROR 04-15 22:42:14 [multiproc_executor.py:273] Worker proc VllmWorker-3 died unexpectedly, shutting down executor.
(EngineCore pid=277) Process EngineCore:
(EngineCore pid=277) Traceback (most recent call last):
(EngineCore pid=277) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore pid=277) self.run()
(EngineCore pid=277) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore pid=277) self._target(*self._args, **self._kwargs)
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core
(EngineCore pid=277) raise e
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core
(EngineCore pid=277) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=277) return func(*args, **kwargs)
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__
(EngineCore pid=277) super().__init__(
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 124, in __init__
(EngineCore pid=277) kv_cache_config = self._initialize_kv_caches(vllm_config)
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=277) return func(*args, **kwargs)
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 280, in _initialize_kv_caches
(EngineCore pid=277) self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 117, in initialize_from_config
(EngineCore pid=277) self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 397, in collective_rpc
(EngineCore pid=277) return aggregate(get_response())
(EngineCore pid=277) ^^^^^^^^^^^^^^
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 380, in get_response
(EngineCore pid=277) raise RuntimeError(
(EngineCore pid=277) RuntimeError: Worker failed with error 'Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.', please check the stack trace above for the root cause
(APIServer pid=1) Traceback (most recent call last):
(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in <module>
(APIServer pid=1) sys.exit(main())
(APIServer pid=1) ^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main
(APIServer pid=1) args.dispatch_function(args)
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd
(APIServer pid=1) uvloop.run(run_server(args))
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
(APIServer pid=1) return __asyncio.run(
(APIServer pid=1) ^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=1) return runner.run(main)
(APIServer pid=1) ^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=1) return self._loop.run_until_complete(task)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
(APIServer pid=1) return await main
(APIServer pid=1) ^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server
(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker
(APIServer pid=1) async with build_async_engine_client(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=1) return await anext(self.gen)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client
(APIServer pid=1) async with build_async_engine_client_from_engine_args(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=1) return await anext(self.gen)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args
(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config
(APIServer pid=1) return cls(
(APIServer pid=1) ^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__
(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(APIServer pid=1) return func(*args, **kwargs)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client
(APIServer pid=1) return AsyncMPClient(*client_args)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(APIServer pid=1) return func(*args, **kwargs)
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__
(APIServer pid=1) super().__init__(
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__
(APIServer pid=1) with launch_core_engines(
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
(APIServer pid=1) next(self.gen)
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines
(APIServer pid=1) wait_for_engine_startup(
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup
(APIServer pid=1) raise RuntimeError(
(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 8 leaked shared_memory objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '

379
lmcache_connector.py Normal file
View File

@@ -0,0 +1,379 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any
import torch
from vllm.config import VllmConfig
from vllm.distributed.kv_events import (
BlockStored,
KVCacheEvent,
KVConnectorKVEvents,
KVEventAggregator,
)
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1,
KVConnectorMetadata,
KVConnectorRole,
SupportsHMA,
)
from vllm.logger import init_logger
from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import KVConnectorOutput
if TYPE_CHECKING:
from vllm.forward_context import ForwardContext
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request
logger = init_logger(__name__)
class LMCacheKVEvents(KVConnectorKVEvents):
    """Concrete ``KVConnectorKVEvents`` backed by a ``KVEventAggregator``.

    Accumulates KV cache events reported by multiple workers and can
    reduce them to the subset of events common to all workers.
    """

    def __init__(self, num_workers: int) -> None:
        self._aggregator = KVEventAggregator(num_workers)

    def add_events(self, events: list[KVCacheEvent]) -> None:
        """Append *events* to the underlying aggregator."""
        self._aggregator.add_events(events)

    def aggregate(self) -> "LMCacheKVEvents":
        """Keep only the events seen by every worker, then reset workers."""
        shared = self._aggregator.get_common_events()
        self._aggregator.clear_events()
        self._aggregator.add_events(shared)
        self._aggregator.reset_workers()
        return self

    def increment_workers(self, count: int = 1) -> None:
        """Record that *count* additional workers contributed events."""
        self._aggregator.increment_workers(count)

    def get_all_events(self) -> list[KVCacheEvent]:
        """Return every event currently held."""
        return self._aggregator.get_all_events()

    def get_number_of_workers(self) -> int:
        """Return how many workers have contributed events."""
        return self._aggregator.get_number_of_workers()

    def clear_events(self) -> None:
        """Drop all events and reset the worker count."""
        self._aggregator.clear_events()
        self._aggregator.reset_workers()

    def __repr__(self) -> str:
        return f"<LMCacheKVEvents events={self.get_all_events()}>"
class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA):
@classmethod
def requires_piecewise_for_cudagraph(cls, extra_config: dict[str, Any]) -> bool:
"""
LMCache requires PIECEWISE CUDA graph mode when layerwise
operations are enabled. The wait_for_layer_load and save_kv_layer
methods perform actual async synchronization that cannot be
captured in CUDA graphs.
"""
return extra_config.get("use_layerwise", False)
def __init__(
self,
vllm_config: "VllmConfig",
role: KVConnectorRole,
kv_cache_config: "KVCacheConfig",
):
super().__init__(
vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
)
assert vllm_config.kv_transfer_config is not None
use_native = vllm_config.kv_transfer_config.get_from_extra_config(
"use_native", False
)
if use_native:
logger.info("Initializing native LMCache connector")
# lazy import
from vllm.distributed.kv_transfer.kv_connector.v1 import lmcache_integration
_adapter = lmcache_integration.vllm_v1_adapter
cls = _adapter.LMCacheConnectorV1Impl
else:
logger.info("Initializing latest dev LMCache connector")
# lazy import
from lmcache.integration.vllm.vllm_v1_adapter import (
LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
)
cls = LMCacheConnectorLatestImpl
self._lmcache_engine = cls(vllm_config, role, self)
self._kv_cache_events: LMCacheKVEvents | None = None
# ==============================
# Worker-side methods
# ==============================
def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
"""
Initialize with the KV caches. Useful for pre-registering the
KV Caches in the KVConnector (e.g. for NIXL).
Args:
kv_caches: dictionary of layer names, kv cache
"""
if hasattr(self._lmcache_engine, "register_kv_caches"):
self._lmcache_engine.register_kv_caches(kv_caches)
else:
logger.warning(
"LMCache engine does not support register_kv_caches, "
"please check and use the latest version"
)
def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
"""
Start loading the KV cache from the connector to vLLM's paged
KV buffer. This is called from the forward context before the
forward pass to enable async loading during model execution.
Args:
forward_context (ForwardContext): the forward context.
**kwargs: additional arguments for the load operation
Note:
The number of elements in kv_caches and layer_names should be
the same.
"""
self._lmcache_engine.start_load_kv(forward_context, **kwargs)
def wait_for_layer_load(self, layer_name: str) -> None:
"""
Block until the KV for a specific layer is loaded into vLLM's
paged buffer. This is called from within attention layer to ensure
async copying from start_load_kv is complete.
This interface will be useful for layer-by-layer pipelining.
Args:
layer_name: the name of that layer
"""
self._lmcache_engine.wait_for_layer_load(layer_name)
def save_kv_layer(
self,
layer_name: str,
kv_layer: torch.Tensor,
attn_metadata: AttentionMetadata,
**kwargs: Any,
) -> None:
"""
Start saving the a layer of KV cache from vLLM's paged buffer
to the connector. This is called from within attention layer to
enable async copying during execution.
Args:
layer_name (str): the name of the layer.
kv_layer (torch.Tensor): the paged KV buffer of the current
layer in vLLM.
attn_metadata (AttentionMetadata): the attention metadata.
**kwargs: additional arguments for the save operation.
"""
self._lmcache_engine.save_kv_layer(
layer_name, kv_layer, attn_metadata, **kwargs
)
def wait_for_save(self):
"""
Block until all the save operations is done. This is called
as the forward context exits to ensure that the async saving
from save_kv_layer is complete before finishing the forward.
This prevents overwrites of paged KV buffer before saving done.
"""
self._lmcache_engine.wait_for_save()
def get_finished(
self, finished_req_ids: set[str]
) -> tuple[set[str] | None, set[str] | None]:
"""
Notifies worker-side connector ids of requests that have
finished generating tokens.
Returns:
ids of requests that have finished asynchronous transfer
(requests that previously returned True from request_finished()),
tuple of (sending/saving ids, recving/loading ids).
The finished saves/sends req ids must belong to a set provided in a
call to this method (this call or a prior one).
"""
return self._lmcache_engine.get_finished(finished_req_ids)
def get_block_ids_with_load_errors(self) -> set[int]:
"""
Get the set of block IDs that failed to load.
Returns:
Set of block IDs that encountered load errors.
Empty set if no load errors occurred.
"""
method = getattr(self._lmcache_engine, "get_block_ids_with_load_errors", None)
if callable(method):
return method()
# Fallback for older versions that don't support this method
return set()
def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None:
"""
Get the KV connector kv cache events collected during the last interval.
"""
events = self._lmcache_engine.get_kv_events() # type: ignore [attr-defined]
if not events:
return None
blocks: list[BlockStored] = [
BlockStored(
block_hashes=e.block_hashes,
parent_block_hash=e.parent_block_hash,
token_ids=e.token_ids,
lora_id=e.lora_id,
block_size=e.block_size,
medium=e.medium,
lora_name=getattr(e, "lora_name", None),
)
for e in events
]
lmcache_kv_events = LMCacheKVEvents(num_workers=1)
lmcache_kv_events.add_events(blocks)
return lmcache_kv_events
# ==============================
# Scheduler-side methods
# ==============================
def get_num_new_matched_tokens(
self,
request: "Request",
num_computed_tokens: int,
) -> tuple[int | None, bool]:
"""
Get number of new tokens that can be loaded from the
external KV cache beyond the num_computed_tokens.
Args:
request (Request): the request object.
num_computed_tokens (int): the number of locally
computed tokens for this request
Returns:
the number of tokens that can be loaded from the
external KV cache beyond what is already computed.
"""
return self._lmcache_engine.get_num_new_matched_tokens(
request, num_computed_tokens
), False
def update_state_after_alloc(
self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
):
"""
Update KVConnector state after block allocation.
"""
self._lmcache_engine.update_state_after_alloc(request, num_external_tokens)
def build_connector_meta(
self, scheduler_output: SchedulerOutput
) -> KVConnectorMetadata:
"""
Build the connector metadata for this step.
This function should NOT modify fields in the scheduler_output.
Also, calling this function will reset the state of the connector.
Args:
scheduler_output (SchedulerOutput): the scheduler output object.
"""
return self._lmcache_engine.build_connector_meta(scheduler_output)
def update_connector_output(self, connector_output: KVConnectorOutput):
"""
Update KVConnector state from worker-side connectors output.
Args:
connector_output (KVConnectorOutput): the worker-side
connectors output.
"""
# Get the KV events
kv_cache_events = connector_output.kv_cache_events
if not kv_cache_events or not isinstance(kv_cache_events, LMCacheKVEvents):
return
if self._kv_cache_events is None:
self._kv_cache_events = kv_cache_events
else:
self._kv_cache_events.add_events(kv_cache_events.get_all_events())
self._kv_cache_events.increment_workers(
kv_cache_events.get_number_of_workers()
)
return
def request_finished(
self,
request: "Request",
block_ids: list[int],
) -> tuple[bool, dict[str, Any] | None]:
"""
Called when a request has finished, before its blocks are freed.
Returns:
True if the request is being saved/sent asynchronously and blocks
should not be freed until the request_id is returned from
get_finished().
Optional KVTransferParams to be included in the request outputs
returned by the engine.
"""
return self._lmcache_engine.request_finished(request, block_ids)
def request_finished_all_groups(
self,
request: "Request",
block_ids: tuple[list[int], ...],
) -> tuple[bool, dict[str, Any] | None]:
"""
Called exactly once when a request has finished for all KV cache
groups (HMA support for hybrid Mamba/Attention models).
LMCache only stores/offloads attention KV cache blocks, so we
extract the first group's block IDs and delegate to the
single-group request_finished.
Args:
request: the request object.
block_ids: tuple of block ID lists, one per KV cache group.
Returns:
Same as request_finished.
"""
# LMCache only handles attention (first) group blocks.
# Mamba SSM state is managed separately by the scheduler.
return self.request_finished(request, block_ids[0])
    def take_events(self) -> Iterable["KVCacheEvent"]:
        """
        Take the KV cache events from the connector.

        Yields:
            New KV cache events since the last call.
        """
        # NOTE(review): this is a generator — the clear/reset below only
        # runs if the caller iterates it to completion.
        if self._kv_cache_events is not None:
            # Reduce to events common to all reporting workers.
            self._kv_cache_events.aggregate()
            kv_cache_events = self._kv_cache_events.get_all_events()
            yield from kv_cache_events
            # Drain the accumulator so the next interval starts fresh.
            self._kv_cache_events.clear_events()
            self._kv_cache_events = None