monkey patch the monkey pathing vllm nonsense
This commit is contained in:
@@ -20,4 +20,10 @@ RUN apt-get update && apt-get install -y git \
|
||||
COPY ./super_v3_reasoning_parser.py /opt/super_v3_reasoning_parser.py
|
||||
|
||||
# Monkey patch more vllm stuff - https://github.com/vllm-project/vllm/pull/38237/changes#diff-bee6813076031d3ca1edc903c1b02b81e4676519afc562ce3fefe37f20c7b650
|
||||
RUN sed -i "s/if self\.kv_events_config is not None:/if self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events:/" /usr/local/lib/python3.12/dist-packages/vllm/config/vllm.py
|
||||
RUN sed -i "s/if self\.kv_events_config is not None:/if self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events:/" /usr/local/lib/python3.12/dist-packages/vllm/config/vllm.py
|
||||
|
||||
# Patch LMCacheConnectorV1 to support HMA (Hybrid Mamba/Attention KV cache manager)
|
||||
# This is required for hybrid models like Nemotron that use both Mamba and Attention layers.
|
||||
# Without this patch, LMCacheConnectorV1 fails with:
|
||||
# "Connector LMCacheConnectorV1 does not support HMA but HMA is enabled"
|
||||
COPY ./lmcache_connector.py /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
|
||||
337
THIS_IS_MY_POD_SUMMARY_VINNY.md
Normal file
337
THIS_IS_MY_POD_SUMMARY_VINNY.md
Normal file
@@ -0,0 +1,337 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
annotations:
|
||||
cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
|
||||
cni.projectcalico.org/podIP: 10.244.248.111/32
|
||||
cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
|
||||
k8s.v1.cni.cncf.io/network-status: |-
|
||||
[{
|
||||
"name": "k8s-pod-network",
|
||||
"ips": [
|
||||
"10.244.248.111",
|
||||
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||||
],
|
||||
"default": true,
|
||||
"dns": {}
|
||||
},{
|
||||
"name": "vllm/ipoib-network-vllm",
|
||||
"interface": "net1",
|
||||
"ips": [
|
||||
"10.66.0.6"
|
||||
],
|
||||
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||||
"dns": {}
|
||||
}]
|
||||
k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
|
||||
k8s.v1.cni.cncf.io/networks-status: |-
|
||||
[{
|
||||
"name": "k8s-pod-network",
|
||||
"ips": [
|
||||
"10.244.248.111",
|
||||
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||||
],
|
||||
"default": true,
|
||||
"dns": {}
|
||||
},{
|
||||
"name": "vllm/ipoib-network-vllm",
|
||||
"interface": "net1",
|
||||
"ips": [
|
||||
"10.66.0.6"
|
||||
],
|
||||
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||||
"dns": {}
|
||||
}]
|
||||
creationTimestamp: '2026-04-15T22:38:27Z'
|
||||
generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
|
||||
generation: 1
|
||||
labels:
|
||||
app.kubernetes.io/component: serving-engine
|
||||
app.kubernetes.io/instance: production-stack-sea-inference
|
||||
app.kubernetes.io/managed-by: helm
|
||||
app.kubernetes.io/name: nemotron-3-super
|
||||
app.kubernetes.io/part-of: vllm-stack
|
||||
environment: test
|
||||
helm-release-name: production-stack-sea-inference
|
||||
model: nemotron-3-super
|
||||
pod-template-hash: 856dc7d695
|
||||
release: test
|
||||
topology.kubernetes.io/region: sea
|
||||
name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
|
||||
namespace: vllm
|
||||
ownerReferences:
|
||||
- apiVersion: apps/v1
|
||||
blockOwnerDeletion: true
|
||||
controller: true
|
||||
kind: ReplicaSet
|
||||
name: >-
|
||||
production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
|
||||
uid: 88c04723-f29b-432a-8318-21a9d389cac4
|
||||
resourceVersion: '29767269'
|
||||
uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- vllm
|
||||
- serve
|
||||
- nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
|
||||
- '--host'
|
||||
- 0.0.0.0
|
||||
- '--port'
|
||||
- '8000'
|
||||
- '--no-enable-prefix-caching'
|
||||
- '--tensor-parallel-size'
|
||||
- '8'
|
||||
- '--async-scheduling'
|
||||
- '--dtype=auto'
|
||||
- '--attention-backend=TRITON_ATTN'
|
||||
- '--gpu_memory_utilization=0.96'
|
||||
- '--enable-auto-tool-choice'
|
||||
- '--tool-call-parser=qwen3_coder'
|
||||
- '--trust_remote_code'
|
||||
- '--max-cudagraph-capture-size=128'
|
||||
- '--enable-chunked-prefill'
|
||||
- '--mamba-ssm-cache-dtype=float16'
|
||||
- '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
|
||||
- '--reasoning-parser=super_v3'
|
||||
- '--max-model-len=1048576'
|
||||
- '--disable-custom-all-reduce'
|
||||
- '--no-disable-hybrid-kv-cache-manager'
|
||||
- '--enforce-eager'
|
||||
- '--kv-transfer-config'
|
||||
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
|
||||
env:
|
||||
- name: PYTHONHASHSEED
|
||||
value: '123'
|
||||
- name: HF_HOME
|
||||
value: /tmp
|
||||
- name: POD_IP
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: status.podIP
|
||||
- name: PROMETHEUS_MULTIPROC_DIR
|
||||
value: /tmp
|
||||
- name: OMP_NUM_THREADS
|
||||
value: '32'
|
||||
- name: HF_TOKEN
|
||||
value: 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
|
||||
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
|
||||
value: '1'
|
||||
- name: NCCL_TOPO_FILE
|
||||
value: /etc/nccl/virtualTopology.xml
|
||||
- name: PYTORCH_CUDA_ALLOC_CONF
|
||||
value: expandable_segments:True
|
||||
- name: LMCACHE_REMOTE_URL
|
||||
value: redis://10.66.0.100:6379
|
||||
- name: LMCACHE_REMOTE_SERDE
|
||||
value: naive
|
||||
- name: LMCACHE_USE_EXPERIMENTAL
|
||||
value: 'True'
|
||||
- name: VLLM_RPC_TIMEOUT
|
||||
value: '1000000'
|
||||
- name: LMCACHE_LOG_LEVEL
|
||||
value: ERROR
|
||||
- name: LMCACHE_LOCAL_CPU
|
||||
value: 'True'
|
||||
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
|
||||
value: '512'
|
||||
- name: LMCACHE_LMCACHE_INSTANCE_ID
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: metadata.name
|
||||
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||||
imagePullPolicy: Always
|
||||
livenessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
name: vllm
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: container-port
|
||||
protocol: TCP
|
||||
- containerPort: 55555
|
||||
name: zmq-port
|
||||
protocol: TCP
|
||||
- containerPort: 9999
|
||||
name: ucx-port
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
requests:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
securityContext:
|
||||
runAsNonRoot: false
|
||||
startupProbe:
|
||||
failureThreshold: 120
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 60
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: shm
|
||||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||||
name: kube-api-access-dlhrd
|
||||
readOnly: true
|
||||
dnsPolicy: ClusterFirst
|
||||
enableServiceLinks: true
|
||||
hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
|
||||
nodeName: b200-nodepool-d51376abbf32
|
||||
preemptionPolicy: PreemptLowerPriority
|
||||
priority: 0
|
||||
restartPolicy: Always
|
||||
schedulerName: default-scheduler
|
||||
securityContext: {}
|
||||
serviceAccount: default
|
||||
serviceAccountName: default
|
||||
subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
|
||||
terminationGracePeriodSeconds: 30
|
||||
tolerations:
|
||||
- effect: NoExecute
|
||||
key: node.kubernetes.io/not-ready
|
||||
operator: Exists
|
||||
tolerationSeconds: 300
|
||||
- effect: NoExecute
|
||||
key: node.kubernetes.io/unreachable
|
||||
operator: Exists
|
||||
tolerationSeconds: 300
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 64Gi
|
||||
name: shm
|
||||
- name: kube-api-access-dlhrd
|
||||
projected:
|
||||
defaultMode: 420
|
||||
sources:
|
||||
- serviceAccountToken:
|
||||
expirationSeconds: 3607
|
||||
path: token
|
||||
- configMap:
|
||||
items:
|
||||
- key: ca.crt
|
||||
path: ca.crt
|
||||
name: kube-root-ca.crt
|
||||
- downwardAPI:
|
||||
items:
|
||||
- fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: metadata.namespace
|
||||
path: namespace
|
||||
status:
|
||||
conditions:
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:32Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: PodReadyToStartContainers
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: Initialized
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
message: 'containers with unready status: [vllm]'
|
||||
observedGeneration: 1
|
||||
reason: ContainersNotReady
|
||||
status: 'False'
|
||||
type: Ready
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
message: 'containers with unready status: [vllm]'
|
||||
observedGeneration: 1
|
||||
reason: ContainersNotReady
|
||||
status: 'False'
|
||||
type: ContainersReady
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: PodScheduled
|
||||
containerStatuses:
|
||||
- allocatedResources:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
containerID: >-
|
||||
containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
|
||||
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||||
imageID: >-
|
||||
atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
|
||||
lastState:
|
||||
terminated:
|
||||
containerID: >-
|
||||
containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
|
||||
exitCode: 1
|
||||
finishedAt: '2026-04-15T22:42:20Z'
|
||||
reason: Error
|
||||
startedAt: '2026-04-15T22:38:31Z'
|
||||
name: vllm
|
||||
ready: false
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
requests:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
restartCount: 1
|
||||
started: false
|
||||
state:
|
||||
running:
|
||||
startedAt: '2026-04-15T22:42:24Z'
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: shm
|
||||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||||
name: kube-api-access-dlhrd
|
||||
readOnly: true
|
||||
recursiveReadOnly: Disabled
|
||||
hostIP: 10.4.96.13
|
||||
hostIPs:
|
||||
- ip: 10.4.96.13
|
||||
- ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
|
||||
observedGeneration: 1
|
||||
phase: Running
|
||||
podIP: 10.244.248.111
|
||||
podIPs:
|
||||
- ip: 10.244.248.111
|
||||
- ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
|
||||
qosClass: Burstable
|
||||
startTime: '2026-04-15T22:38:27Z'
|
||||
545
THIS_IS_THE_ERROR_VINNY.md
Normal file
545
THIS_IS_THE_ERROR_VINNY.md
Normal file
@@ -0,0 +1,545 @@
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299]
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] █ █ █▄ ▄█
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] █▄█▀ █ █ █ █ model nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299]
|
||||
(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:233] non-default args: {'model_tag': 'nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', 'enable_auto_tool_choice': True, 'tool_call_parser': 'qwen3_coder', 'host': '0.0.0.0', 'model': 'nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', 'trust_remote_code': True, 'max_model_len': 1048576, 'enforce_eager': True, 'attention_backend': 'TRITON_ATTN', 'reasoning_parser': 'super_v3', 'reasoning_parser_plugin': '/opt/super_v3_reasoning_parser.py', 'tensor_parallel_size': 8, 'disable_custom_all_reduce': True, 'gpu_memory_utilization': 0.96, 'enable_prefix_caching': False, 'mamba_ssm_cache_dtype': 'float16', 'enable_chunked_prefill': True, 'disable_hybrid_kv_cache_manager': False, 'async_scheduling': True, 'max_cudagraph_capture_size': 128, 'kv_transfer_config': KVTransferConfig(kv_connector='LMCacheConnectorV1', engine_id='dea40998-1518-4361-a31f-884d3c1c1e74', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='fail')}
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_ADDR
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_PORT
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_PROTO
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_PORT
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT_HTTP_MONITORING
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT_LISTENER_80
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_HOST
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_PROTO
|
||||
(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_ADDR
|
||||
(APIServer pid=1) A new version of the following files was downloaded from https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4:
|
||||
(APIServer pid=1) - configuration_nemotron_h.py
|
||||
(APIServer pid=1) . Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [model.py:549] Resolved architecture: NemotronHForCausalLM
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [model.py:2176] User-specified max_model_len (1048576) is greater than the derived max_model_len (max_position_embeddings=262144.0 or model_max_length=None in model's config.json). VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme caution. If the model uses relative position encoding (RoPE), positions exceeding derived_max_model_len lead to nan. If the model uses absolute position encoding, positions exceeding derived_max_model_len will cause a CUDA array out-of-bounds error.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [model.py:1678] Using max model len 1048576
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [cache.py:227] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [config.py:281] Setting attention block size to 1056 tokens to ensure that attention page size is >= mamba page size.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [config.py:312] Padding mamba page size by 0.19% to ensure that mamba page size and attention page size are exactly equal.
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [modelopt.py:381] Detected ModelOpt fp8 checkpoint (quant_algo=FP8). Please note that the format is experimental and could change.
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [modelopt.py:998] Detected ModelOpt NVFP4 checkpoint. Please note that the format is experimental and could change in future.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [vllm.py:790] Asynchronous scheduling is enabled.
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none
|
||||
(APIServer pid=1) WARNING 04-15 22:38:46 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.
|
||||
(APIServer pid=1) INFO 04-15 22:38:46 [vllm.py:1025] Cudagraph is disabled under eager mode
|
||||
(APIServer pid=1) INFO 04-15 22:38:51 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant, allreduce_rms
|
||||
(EngineCore pid=277) INFO 04-15 22:38:58 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', speculative_config=None, tokenizer='nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=1048576, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=True, quantization=modelopt_mixed, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=fp8_e4m3, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='super_v3', reasoning_parser_plugin='/opt/super_v3_reasoning_parser.py', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.NONE: 0>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 
'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.NONE: 0>, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': True, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
|
||||
(EngineCore pid=277) INFO 04-15 22:38:58 [multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.244.248.111 (local), world_size=8, local_world_size=8
|
||||
(Worker pid=348) INFO 04-15 22:39:03 [parallel_state.py:1400] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=415) INFO 04-15 22:39:07 [parallel_state.py:1400] world_size=8 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=483) INFO 04-15 22:39:11 [parallel_state.py:1400] world_size=8 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=556) INFO 04-15 22:39:15 [parallel_state.py:1400] world_size=8 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=629) INFO 04-15 22:39:19 [parallel_state.py:1400] world_size=8 rank=4 local_rank=4 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=702) INFO 04-15 22:39:23 [parallel_state.py:1400] world_size=8 rank=5 local_rank=5 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=775) INFO 04-15 22:39:27 [parallel_state.py:1400] world_size=8 rank=6 local_rank=6 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=848) INFO 04-15 22:39:31 [parallel_state.py:1400] world_size=8 rank=7 local_rank=7 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl
|
||||
(Worker pid=348) INFO 04-15 22:39:31 [pynccl.py:111] vLLM is using nccl==2.28.9
|
||||
(Worker pid=348) INFO 04-15 22:39:36 [parallel_state.py:1716] rank 0 in world size 8 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:37 [gpu_model_runner.py:4735] Starting to load model nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4...
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [__init__.py:261] Selected FlashInferFP8ScaledMMLinearKernel for ModelOptFp8LinearMethod
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform.
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [nvfp4_utils.py:85] Using NvFp4LinearBackend.FLASHINFER_CUTLASS for NVFP4 GEMM
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [nvfp4.py:256] Using 'FLASHINFER_TRTLLM' NvFp4 MoE backend out of potential backends: ['FLASHINFER_TRTLLM', 'FLASHINFER_CUTEDSL', 'FLASHINFER_CUTLASS', 'VLLM_CUTLASS', 'MARLIN'].
|
||||
(Worker_TP1 pid=415) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP4 pid=629) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP2 pid=483) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP6 pid=775) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP7 pid=848) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP3 pid=556) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP5 pid=702) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend.
|
||||
(Worker_TP1 pid=415) INFO 04-15 22:41:36 [weight_utils.py:581] Time spent downloading weights for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4: 116.048954 seconds
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 0% Completed | 0/17 [00:00<?, ?it/s]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 6% Completed | 1/17 [00:01<00:20, 1.29s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 12% Completed | 2/17 [00:03<00:23, 1.60s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 18% Completed | 3/17 [00:04<00:21, 1.52s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 24% Completed | 4/17 [00:06<00:20, 1.58s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 29% Completed | 5/17 [00:07<00:17, 1.49s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 35% Completed | 6/17 [00:08<00:16, 1.47s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 41% Completed | 7/17 [00:10<00:14, 1.42s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 47% Completed | 8/17 [00:11<00:12, 1.39s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 53% Completed | 9/17 [00:13<00:11, 1.40s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 59% Completed | 10/17 [00:14<00:10, 1.47s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 65% Completed | 11/17 [00:16<00:08, 1.45s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 71% Completed | 12/17 [00:17<00:07, 1.44s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 76% Completed | 13/17 [00:18<00:05, 1.43s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 82% Completed | 14/17 [00:20<00:04, 1.43s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 88% Completed | 15/17 [00:21<00:02, 1.30s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
Loading safetensors checkpoint shards: 100% Completed | 17/17 [00:21<00:00, 1.26s/it]
|
||||
(Worker_TP0 pid=348)
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:41:59 [default_loader.py:384] Loading weights took 21.38 seconds
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:41:59 [flashinfer_utils.py:238] Padding intermediate size from 336 to 384 for up/down projection weights.
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:41:59 [nvfp4.py:401] Using MoEPrepareAndFinalizeNoDPEPMonolithic
|
||||
(Worker_TP0 pid=348) WARNING 04-15 22:41:59 [kv_cache.py:94] Checkpoint does not provide a q scaling factor. Setting it to k_scale. This only matters for FP8 Attention backends (flash-attn or flashinfer).
|
||||
(Worker_TP0 pid=348) WARNING 04-15 22:41:59 [kv_cache.py:108] Using KV cache scaling factor 1.0 for fp8_e4m3. If this is unintended, verify that k/v_scale scaling factors are properly set in the checkpoint.
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:42:01 [gpu_model_runner.py:4820] Model loading took 10.4 GiB memory and 142.225157 seconds
|
||||
(Worker_TP0 pid=348) INFO 04-15 22:42:10 [gpu_worker.py:436] Available KV cache memory: 158.16 GiB
|
||||
(EngineCore pid=277) INFO 04-15 22:42:11 [kv_cache_utils.py:1319] GPU KV cache size: 13,819,872 tokens
|
||||
(EngineCore pid=277) INFO 04-15 22:42:11 [kv_cache_utils.py:1324] Maximum concurrency for 1,048,576 tokens per request: 78.68x
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP7 pid=848) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP6 pid=775) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP3 pid=556) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP0 pid=348) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP4 pid=629) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP2 pid=483) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP5 pid=702) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] WorkerProc hit an exception.
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] Traceback (most recent call last):
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 944, in worker_busy_loop
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] output = func(*args, **kwargs)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] self.worker.initialize_from_config(kv_cache_config) # type: ignore
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] return func(*args, **kwargs)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 527, in initialize_from_config
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config)
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_transfer_state.py", line 67, in ensure_kv_transfer_initialized
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector(
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/factory.py", line 59, in create_connector
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] raise ValueError(
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949] ValueError: Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.
|
||||
(Worker_TP1 pid=415) ERROR 04-15 22:42:11 [multiproc_executor.py:949]
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] EngineCore failed to start.
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] Traceback (most recent call last):
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] return func(*args, **kwargs)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] super().__init__(
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 124, in __init__
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] kv_cache_config = self._initialize_kv_caches(vllm_config)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] return func(*args, **kwargs)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 280, in _initialize_kv_caches
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] self.model_executor.initialize_from_config(kv_cache_configs)
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 117, in initialize_from_config
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 397, in collective_rpc
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] return aggregate(get_response())
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] ^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 380, in get_response
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] raise RuntimeError(
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:11 [core.py:1108] RuntimeError: Worker failed with error 'Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.', please check the stack trace above for the root cause
|
||||
(Worker_TP3 pid=556) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP0 pid=348) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP6 pid=775) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP7 pid=848) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP4 pid=629) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP2 pid=483) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP1 pid=415) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(Worker_TP5 pid=702) WARNING 04-15 22:42:11 [multiproc_executor.py:871] WorkerProc was terminated
|
||||
(EngineCore pid=277) ERROR 04-15 22:42:14 [multiproc_executor.py:273] Worker proc VllmWorker-3 died unexpectedly, shutting down executor.
|
||||
(EngineCore pid=277) Process EngineCore:
|
||||
(EngineCore pid=277) Traceback (most recent call last):
|
||||
(EngineCore pid=277) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
|
||||
(EngineCore pid=277) self.run()
|
||||
(EngineCore pid=277) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
|
||||
(EngineCore pid=277) self._target(*self._args, **self._kwargs)
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core
|
||||
(EngineCore pid=277) raise e
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core
|
||||
(EngineCore pid=277) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=277) return func(*args, **kwargs)
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__
|
||||
(EngineCore pid=277) super().__init__(
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 124, in __init__
|
||||
(EngineCore pid=277) kv_cache_config = self._initialize_kv_caches(vllm_config)
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(EngineCore pid=277) return func(*args, **kwargs)
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 280, in _initialize_kv_caches
|
||||
(EngineCore pid=277) self.model_executor.initialize_from_config(kv_cache_configs)
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 117, in initialize_from_config
|
||||
(EngineCore pid=277) self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 397, in collective_rpc
|
||||
(EngineCore pid=277) return aggregate(get_response())
|
||||
(EngineCore pid=277) ^^^^^^^^^^^^^^
|
||||
(EngineCore pid=277) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 380, in get_response
|
||||
(EngineCore pid=277) raise RuntimeError(
|
||||
(EngineCore pid=277) RuntimeError: Worker failed with error 'Connector LMCacheConnectorV1 does not support HMA but HMA is enabled. Please set `--disable-hybrid-kv-cache-manager`.', please check the stack trace above for the root cause
|
||||
(APIServer pid=1) Traceback (most recent call last):
|
||||
(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in <module>
|
||||
(APIServer pid=1) sys.exit(main())
|
||||
(APIServer pid=1) ^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main
|
||||
(APIServer pid=1) args.dispatch_function(args)
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd
|
||||
(APIServer pid=1) uvloop.run(run_server(args))
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
|
||||
(APIServer pid=1) return __asyncio.run(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
|
||||
(APIServer pid=1) return runner.run(main)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
|
||||
(APIServer pid=1) return self._loop.run_until_complete(task)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
|
||||
(APIServer pid=1) return await main
|
||||
(APIServer pid=1) ^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server
|
||||
(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker
|
||||
(APIServer pid=1) async with build_async_engine_client(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
|
||||
(APIServer pid=1) return await anext(self.gen)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client
|
||||
(APIServer pid=1) async with build_async_engine_client_from_engine_args(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
|
||||
(APIServer pid=1) return await anext(self.gen)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args
|
||||
(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config
|
||||
(APIServer pid=1) return cls(
|
||||
(APIServer pid=1) ^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__
|
||||
(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(APIServer pid=1) return func(*args, **kwargs)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client
|
||||
(APIServer pid=1) return AsyncMPClient(*client_args)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
|
||||
(APIServer pid=1) return func(*args, **kwargs)
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__
|
||||
(APIServer pid=1) super().__init__(
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__
|
||||
(APIServer pid=1) with launch_core_engines(
|
||||
(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^
|
||||
(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
|
||||
(APIServer pid=1) next(self.gen)
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines
|
||||
(APIServer pid=1) wait_for_engine_startup(
|
||||
(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup
|
||||
(APIServer pid=1) raise RuntimeError(
|
||||
(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
|
||||
/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 8 leaked shared_memory objects to clean up at shutdown
|
||||
warnings.warn('resource_tracker: There appear to be %d '
|
||||
379
lmcache_connector.py
Normal file
379
lmcache_connector.py
Normal file
@@ -0,0 +1,379 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Iterable
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_events import (
|
||||
BlockStored,
|
||||
KVCacheEvent,
|
||||
KVConnectorKVEvents,
|
||||
KVEventAggregator,
|
||||
)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
KVConnectorBase_V1,
|
||||
KVConnectorMetadata,
|
||||
KVConnectorRole,
|
||||
SupportsHMA,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.attention.backend import AttentionMetadata
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.outputs import KVConnectorOutput
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.request import Request
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class LMCacheKVEvents(KVConnectorKVEvents):
    """KVConnectorKVEvents implementation backed by a ``KVEventAggregator``.

    All storage and worker bookkeeping is delegated to the aggregator;
    this class only adapts it to the KVConnectorKVEvents interface.
    """

    def __init__(self, num_workers: int) -> None:
        # The aggregator tracks per-worker contributions so that
        # aggregation can keep only events reported by every worker.
        self._aggregator = KVEventAggregator(num_workers)

    def add_events(self, events: list[KVCacheEvent]) -> None:
        """Record a batch of KV cache events."""
        self._aggregator.add_events(events)

    def aggregate(self) -> "LMCacheKVEvents":
        """Collapse the stored events down to those common to all workers.

        The aggregator is cleared, re-seeded with only the common events,
        and its worker bookkeeping reset. Returns ``self`` for chaining.
        """
        shared = self._aggregator.get_common_events()
        self._aggregator.clear_events()
        self._aggregator.add_events(shared)
        self._aggregator.reset_workers()
        return self

    def increment_workers(self, count: int = 1) -> None:
        """Increase the tracked worker count by ``count``."""
        self._aggregator.increment_workers(count)

    def get_all_events(self) -> list[KVCacheEvent]:
        """Return every event currently held by the aggregator."""
        return self._aggregator.get_all_events()

    def get_number_of_workers(self) -> int:
        """Return the number of workers currently tracked."""
        return self._aggregator.get_number_of_workers()

    def clear_events(self) -> None:
        """Drop all stored events and reset the worker bookkeeping."""
        self._aggregator.clear_events()
        self._aggregator.reset_workers()

    def __repr__(self) -> str:
        return f"<LMCacheKVEvents events={self.get_all_events()}>"
|
||||
|
||||
|
||||
class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA):
    """vLLM v1 KV connector backed by an LMCache engine.

    Nearly every method is a thin delegation to the underlying
    ``LMCacheConnectorV1Impl``. Declaring ``SupportsHMA`` lets this
    connector be used with the hybrid (Mamba/Attention) KV cache manager.
    """

    @classmethod
    def requires_piecewise_for_cudagraph(cls, extra_config: dict[str, Any]) -> bool:
        """Return whether PIECEWISE CUDA graph mode is required.

        With layerwise operation enabled, wait_for_layer_load and
        save_kv_layer perform real async synchronization that cannot be
        captured inside a CUDA graph, so piecewise capture is needed.
        """
        return extra_config.get("use_layerwise", False)

    def __init__(
        self,
        vllm_config: "VllmConfig",
        role: KVConnectorRole,
        kv_cache_config: "KVCacheConfig",
    ):
        super().__init__(
            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
        )
        assert vllm_config.kv_transfer_config is not None

        # Choose the engine implementation: the copy vendored inside vLLM
        # ("native") or the one shipped with the installed lmcache package.
        use_native = vllm_config.kv_transfer_config.get_from_extra_config(
            "use_native", False
        )
        if use_native:
            logger.info("Initializing native LMCache connector")
            # lazy import to avoid a hard dependency at module load time
            from vllm.distributed.kv_transfer.kv_connector.v1 import lmcache_integration

            impl_cls = lmcache_integration.vllm_v1_adapter.LMCacheConnectorV1Impl
        else:
            logger.info("Initializing latest dev LMCache connector")
            # lazy import to avoid a hard dependency at module load time
            from lmcache.integration.vllm.vllm_v1_adapter import (
                LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
            )

            impl_cls = LMCacheConnectorLatestImpl

        self._lmcache_engine = impl_cls(vllm_config, role, self)

        # Scheduler-side accumulator of worker KV events, merged in
        # update_connector_output and drained by take_events.
        self._kv_cache_events: LMCacheKVEvents | None = None

    # ==============================
    # Worker-side methods
    # ==============================
    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
        """Hand the paged KV cache tensors to the engine up front.

        Args:
            kv_caches: mapping of layer name -> KV cache tensor.
        """
        register = getattr(self._lmcache_engine, "register_kv_caches", None)
        if register is None:
            # Older engine versions lack this hook; warn rather than fail.
            logger.warning(
                "LMCache engine does not support register_kv_caches, "
                "please check and use the latest version"
            )
        else:
            register(kv_caches)

    def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
        """Kick off async loading of KV cache into vLLM's paged buffer.

        Invoked from the forward context before the forward pass so the
        load can overlap with model execution.

        Args:
            forward_context: the forward context.
            **kwargs: extra arguments forwarded to the engine.
        """
        self._lmcache_engine.start_load_kv(forward_context, **kwargs)

    def wait_for_layer_load(self, layer_name: str) -> None:
        """Block until one layer's KV has landed in the paged buffer.

        Called from within the attention layer; enables layer-by-layer
        pipelining of loads started by start_load_kv.

        Args:
            layer_name: name of the layer to wait on.
        """
        self._lmcache_engine.wait_for_layer_load(layer_name)

    def save_kv_layer(
        self,
        layer_name: str,
        kv_layer: torch.Tensor,
        attn_metadata: AttentionMetadata,
        **kwargs: Any,
    ) -> None:
        """Begin async saving of one layer's KV from the paged buffer.

        Called from within the attention layer so the copy overlaps with
        execution of subsequent layers.

        Args:
            layer_name: name of the layer being saved.
            kv_layer: that layer's paged KV buffer in vLLM.
            attn_metadata: the attention metadata.
            **kwargs: extra arguments forwarded to the engine.
        """
        self._lmcache_engine.save_kv_layer(
            layer_name, kv_layer, attn_metadata, **kwargs
        )

    def wait_for_save(self):
        """Block until every outstanding save has completed.

        Called as the forward context exits, preventing the paged KV
        buffer from being overwritten before saving finishes.
        """
        self._lmcache_engine.wait_for_save()

    def get_finished(
        self, finished_req_ids: set[str]
    ) -> tuple[set[str] | None, set[str] | None]:
        """Report request ids whose async transfers have completed.

        Args:
            finished_req_ids: ids of requests that finished generating.

        Returns:
            Tuple of (finished sending/saving ids, finished
            recving/loading ids); ids must have been passed to this
            method (now or previously) and have returned True from
            request_finished().
        """
        return self._lmcache_engine.get_finished(finished_req_ids)

    def get_block_ids_with_load_errors(self) -> set[int]:
        """Return block IDs that failed to load (empty set if none).

        Older engine versions lack this hook, in which case an empty
        set is returned.
        """
        if callable(getattr(self._lmcache_engine, "get_block_ids_with_load_errors", None)):
            return self._lmcache_engine.get_block_ids_with_load_errors()
        return set()

    def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None:
        """Return KV cache events gathered since the last call, if any."""
        raw_events = self._lmcache_engine.get_kv_events()  # type: ignore [attr-defined]
        if not raw_events:
            return None

        # Translate engine-level events into vLLM BlockStored events.
        stored: list[BlockStored] = []
        for ev in raw_events:
            stored.append(
                BlockStored(
                    block_hashes=ev.block_hashes,
                    parent_block_hash=ev.parent_block_hash,
                    token_ids=ev.token_ids,
                    lora_id=ev.lora_id,
                    block_size=ev.block_size,
                    medium=ev.medium,
                    # lora_name is absent on older engine event types
                    lora_name=getattr(ev, "lora_name", None),
                )
            )

        result = LMCacheKVEvents(num_workers=1)
        result.add_events(stored)
        return result

    # ==============================
    # Scheduler-side methods
    # ==============================
    def get_num_new_matched_tokens(
        self,
        request: "Request",
        num_computed_tokens: int,
    ) -> tuple[int | None, bool]:
        """Query how many extra tokens the external cache can supply.

        Args:
            request: the request object.
            num_computed_tokens: tokens already computed locally.

        Returns:
            (number of additional loadable tokens, False) — the second
            element indicates the load is not asynchronous.
        """
        matched = self._lmcache_engine.get_num_new_matched_tokens(
            request, num_computed_tokens
        )
        return matched, False

    def update_state_after_alloc(
        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
    ):
        """Update connector state after KV blocks were allocated."""
        self._lmcache_engine.update_state_after_alloc(request, num_external_tokens)

    def build_connector_meta(
        self, scheduler_output: SchedulerOutput
    ) -> KVConnectorMetadata:
        """Build this step's connector metadata (resets connector state).

        Must not mutate fields of ``scheduler_output``.

        Args:
            scheduler_output: the scheduler output object.
        """
        return self._lmcache_engine.build_connector_meta(scheduler_output)

    def update_connector_output(self, connector_output: KVConnectorOutput):
        """Merge worker-side KV events into the scheduler-side accumulator.

        Args:
            connector_output: the worker-side connectors output.
        """
        incoming = connector_output.kv_cache_events
        # Ignore absent output or events of an unexpected type.
        if not incoming or not isinstance(incoming, LMCacheKVEvents):
            return

        if self._kv_cache_events is None:
            self._kv_cache_events = incoming
        else:
            self._kv_cache_events.add_events(incoming.get_all_events())
            self._kv_cache_events.increment_workers(
                incoming.get_number_of_workers()
            )

    def request_finished(
        self,
        request: "Request",
        block_ids: list[int],
    ) -> tuple[bool, dict[str, Any] | None]:
        """Notify the engine that a request finished, before blocks free.

        Returns:
            (True if blocks must be kept until get_finished() reports the
            request id, optional KVTransferParams for the request output).
        """
        return self._lmcache_engine.request_finished(request, block_ids)

    def request_finished_all_groups(
        self,
        request: "Request",
        block_ids: tuple[list[int], ...],
    ) -> tuple[bool, dict[str, Any] | None]:
        """HMA variant of request_finished for all KV cache groups.

        LMCache only stores/offloads attention KV blocks, so only the
        first group's block IDs are forwarded; Mamba SSM state is managed
        separately by the scheduler.

        Args:
            request: the request object.
            block_ids: one block ID list per KV cache group.

        Returns:
            Same as request_finished.
        """
        return self.request_finished(request, block_ids[0])

    def take_events(self) -> Iterable["KVCacheEvent"]:
        """Yield and drain the KV cache events accumulated since last call."""
        if self._kv_cache_events is None:
            return
        # Reduce to events agreed on by all workers, then hand them out.
        self._kv_cache_events.aggregate()
        pending = self._kv_cache_events.get_all_events()
        yield from pending
        self._kv_cache_events.clear_events()
        self._kv_cache_events = None
|
||||
Reference in New Issue
Block a user