From c570c4658e0670d058ea803f8bd52f6c8f244a06 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Wed, 15 Apr 2026 22:55:00 +0000 Subject: [PATCH] monkey patch the monkey pathing vllm nonsense --- Dockerfile | 8 +- THIS_IS_MY_POD_SUMMARY_VINNY.md | 337 ++++++++++++++++++++ THIS_IS_THE_ERROR_VINNY.md | 545 ++++++++++++++++++++++++++++++++ lmcache_connector.py | 379 ++++++++++++++++++++++ 4 files changed, 1268 insertions(+), 1 deletion(-) create mode 100644 THIS_IS_MY_POD_SUMMARY_VINNY.md create mode 100644 THIS_IS_THE_ERROR_VINNY.md create mode 100644 lmcache_connector.py diff --git a/Dockerfile b/Dockerfile index 4ed58f3..d73715e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,4 +20,10 @@ RUN apt-get update && apt-get install -y git \ COPY ./super_v3_reasoning_parser.py /opt/super_v3_reasoning_parser.py # Monkey patch more vllm stuff - https://github.com/vllm-project/vllm/pull/38237/changes#diff-bee6813076031d3ca1edc903c1b02b81e4676519afc562ce3fefe37f20c7b650 -RUN sed -i "s/if self\.kv_events_config is not None:/if self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events:/" /usr/local/lib/python3.12/dist-packages/vllm/config/vllm.py \ No newline at end of file +RUN sed -i "s/if self\.kv_events_config is not None:/if self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events:/" /usr/local/lib/python3.12/dist-packages/vllm/config/vllm.py + +# Patch LMCacheConnectorV1 to support HMA (Hybrid Mamba/Attention KV cache manager) +# This is required for hybrid models like Nemotron that use both Mamba and Attention layers. 
+# Without this patch, LMCacheConnectorV1 fails with: +# "Connector LMCacheConnectorV1 does not support HMA but HMA is enabled" +COPY ./lmcache_connector.py /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py \ No newline at end of file diff --git a/THIS_IS_MY_POD_SUMMARY_VINNY.md b/THIS_IS_MY_POD_SUMMARY_VINNY.md new file mode 100644 index 0000000..7dc372d --- /dev/null +++ b/THIS_IS_MY_POD_SUMMARY_VINNY.md @@ -0,0 +1,337 @@ +apiVersion: v1 +kind: Pod +metadata: + annotations: + cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2 + cni.projectcalico.org/podIP: 10.244.248.111/32 + cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128 + k8s.v1.cni.cncf.io/network-status: |- + [{ + "name": "k8s-pod-network", + "ips": [ + "10.244.248.111", + "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0" + ], + "default": true, + "dns": {} + },{ + "name": "vllm/ipoib-network-vllm", + "interface": "net1", + "ips": [ + "10.66.0.6" + ], + "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44", + "dns": {} + }] + k8s.v1.cni.cncf.io/networks: ipoib-network-vllm + k8s.v1.cni.cncf.io/networks-status: |- + [{ + "name": "k8s-pod-network", + "ips": [ + "10.244.248.111", + "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0" + ], + "default": true, + "dns": {} + },{ + "name": "vllm/ipoib-network-vllm", + "interface": "net1", + "ips": [ + "10.66.0.6" + ], + "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44", + "dns": {} + }] + creationTimestamp: '2026-04-15T22:38:27Z' + generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695- + generation: 1 + labels: + app.kubernetes.io/component: serving-engine + app.kubernetes.io/instance: production-stack-sea-inference + app.kubernetes.io/managed-by: helm + app.kubernetes.io/name: nemotron-3-super + app.kubernetes.io/part-of: vllm-stack + environment: test + helm-release-name: 
production-stack-sea-inference + model: nemotron-3-super + pod-template-hash: 856dc7d695 + release: test + topology.kubernetes.io/region: sea + name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl + namespace: vllm + ownerReferences: + - apiVersion: apps/v1 + blockOwnerDeletion: true + controller: true + kind: ReplicaSet + name: >- + production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695 + uid: 88c04723-f29b-432a-8318-21a9d389cac4 + resourceVersion: '29767269' + uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e +spec: + containers: + - command: + - vllm + - serve + - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 + - '--host' + - 0.0.0.0 + - '--port' + - '8000' + - '--no-enable-prefix-caching' + - '--tensor-parallel-size' + - '8' + - '--async-scheduling' + - '--dtype=auto' + - '--attention-backend=TRITON_ATTN' + - '--gpu_memory_utilization=0.96' + - '--enable-auto-tool-choice' + - '--tool-call-parser=qwen3_coder' + - '--trust_remote_code' + - '--max-cudagraph-capture-size=128' + - '--enable-chunked-prefill' + - '--mamba-ssm-cache-dtype=float16' + - '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py' + - '--reasoning-parser=super_v3' + - '--max-model-len=1048576' + - '--disable-custom-all-reduce' + - '--no-disable-hybrid-kv-cache-manager' + - '--enforce-eager' + - '--kv-transfer-config' + - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}' + env: + - name: PYTHONHASHSEED + value: '123' + - name: HF_HOME + value: /tmp + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: PROMETHEUS_MULTIPROC_DIR + value: /tmp + - name: OMP_NUM_THREADS + value: '32' + - name: HF_TOKEN + value: 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 + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: '1' + - name: NCCL_TOPO_FILE + value: /etc/nccl/virtualTopology.xml + - name: PYTORCH_CUDA_ALLOC_CONF + value: expandable_segments:True + - name: LMCACHE_REMOTE_URL + value: redis://10.66.0.100:6379 + - name: 
LMCACHE_REMOTE_SERDE + value: naive + - name: LMCACHE_USE_EXPERIMENTAL + value: 'True' + - name: VLLM_RPC_TIMEOUT + value: '1000000' + - name: LMCACHE_LOG_LEVEL + value: ERROR + - name: LMCACHE_LOCAL_CPU + value: 'True' + - name: LMCACHE_MAX_LOCAL_CPU_SIZE + value: '512' + - name: LMCACHE_LMCACHE_INSTANCE_ID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130 + imagePullPolicy: Always + livenessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + name: vllm + ports: + - containerPort: 8000 + name: container-port + protocol: TCP + - containerPort: 55555 + name: zmq-port + protocol: TCP + - containerPort: 9999 + name: ucx-port + protocol: TCP + readinessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + memory: 1500Gi + nvidia.com/gpu: '8' + rdma/ib: '1' + requests: + cpu: '8' + memory: 16Gi + nvidia.com/gpu: '8' + rdma/ib: '1' + securityContext: + runAsNonRoot: false + startupProbe: + failureThreshold: 120 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 60 + successThreshold: 1 + timeoutSeconds: 1 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /dev/shm + name: shm + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-dlhrd + readOnly: true + dnsPolicy: ClusterFirst + enableServiceLinks: true + hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack + nodeName: b200-nodepool-d51376abbf32 + preemptionPolicy: PreemptLowerPriority + priority: 0 + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + serviceAccount: default + serviceAccountName: 
default + subdomain: production-stack-sea-inference-nemotron-3-super-engine-service + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - emptyDir: + medium: Memory + sizeLimit: 64Gi + name: shm + - name: kube-api-access-dlhrd + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - configMap: + items: + - key: ca.crt + path: ca.crt + name: kube-root-ca.crt + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace +status: + conditions: + - lastProbeTime: null + lastTransitionTime: '2026-04-15T22:38:32Z' + observedGeneration: 1 + status: 'True' + type: PodReadyToStartContainers + - lastProbeTime: null + lastTransitionTime: '2026-04-15T22:38:27Z' + observedGeneration: 1 + status: 'True' + type: Initialized + - lastProbeTime: null + lastTransitionTime: '2026-04-15T22:38:27Z' + message: 'containers with unready status: [vllm]' + observedGeneration: 1 + reason: ContainersNotReady + status: 'False' + type: Ready + - lastProbeTime: null + lastTransitionTime: '2026-04-15T22:38:27Z' + message: 'containers with unready status: [vllm]' + observedGeneration: 1 + reason: ContainersNotReady + status: 'False' + type: ContainersReady + - lastProbeTime: null + lastTransitionTime: '2026-04-15T22:38:27Z' + observedGeneration: 1 + status: 'True' + type: PodScheduled + containerStatuses: + - allocatedResources: + cpu: '8' + memory: 16Gi + nvidia.com/gpu: '8' + rdma/ib: '1' + containerID: >- + containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b + image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130 + imageID: >- + atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59 + lastState: + 
terminated: + containerID: >- + containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a + exitCode: 1 + finishedAt: '2026-04-15T22:42:20Z' + reason: Error + startedAt: '2026-04-15T22:38:31Z' + name: vllm + ready: false + resources: + limits: + memory: 1500Gi + nvidia.com/gpu: '8' + rdma/ib: '1' + requests: + cpu: '8' + memory: 16Gi + nvidia.com/gpu: '8' + rdma/ib: '1' + restartCount: 1 + started: false + state: + running: + startedAt: '2026-04-15T22:42:24Z' + volumeMounts: + - mountPath: /dev/shm + name: shm + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access-dlhrd + readOnly: true + recursiveReadOnly: Disabled + hostIP: 10.4.96.13 + hostIPs: + - ip: 10.4.96.13 + - ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32 + observedGeneration: 1 + phase: Running + podIP: 10.244.248.111 + podIPs: + - ip: 10.244.248.111 + - ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0 + qosClass: Burstable + startTime: '2026-04-15T22:38:27Z' diff --git a/THIS_IS_THE_ERROR_VINNY.md b/THIS_IS_THE_ERROR_VINNY.md new file mode 100644 index 0000000..a729a8f --- /dev/null +++ b/THIS_IS_THE_ERROR_VINNY.md @@ -0,0 +1,545 @@ +(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] +(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] █▄█▀ █ █ █ █ model nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 +(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:299] +(APIServer pid=1) INFO 04-15 22:38:39 [utils.py:233] non-default args: {'model_tag': 'nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', 'enable_auto_tool_choice': True, 'tool_call_parser': 'qwen3_coder', 'host': '0.0.0.0', 'model': 'nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', 'trust_remote_code': True, 'max_model_len': 1048576, 'enforce_eager': True, 'attention_backend': 'TRITON_ATTN', 
'reasoning_parser': 'super_v3', 'reasoning_parser_plugin': '/opt/super_v3_reasoning_parser.py', 'tensor_parallel_size': 8, 'disable_custom_all_reduce': True, 'gpu_memory_utilization': 0.96, 'enable_prefix_caching': False, 'mamba_ssm_cache_dtype': 'float16', 'enable_chunked_prefill': True, 'disable_hybrid_kv_cache_manager': False, 'async_scheduling': True, 'max_cudagraph_capture_size': 128, 'kv_transfer_config': KVTransferConfig(kv_connector='LMCacheConnectorV1', engine_id='dea40998-1518-4361-a31f-884d3c1c1e74', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False, kv_load_failure_policy='fail')} +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_ADDR +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_PORT +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_PROTO +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_PORT +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: 
VLLM_ROUTER_GATEWAY_SERVICE_PORT_HTTP_MONITORING +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_PORT_LISTENER_80 +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_SERVICE_HOST +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_80_TCP_PROTO +(APIServer pid=1) WARNING 04-15 22:38:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ROUTER_GATEWAY_PORT_9091_TCP_ADDR +(APIServer pid=1) A new version of the following files was downloaded from https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4: +(APIServer pid=1) - configuration_nemotron_h.py +(APIServer pid=1) . Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision. +(APIServer pid=1) INFO 04-15 22:38:46 [model.py:549] Resolved architecture: NemotronHForCausalLM +(APIServer pid=1) WARNING 04-15 22:38:46 [model.py:2176] User-specified max_model_len (1048576) is greater than the derived max_model_len (max_position_embeddings=262144.0 or model_max_length=None in model's config.json). VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme caution. If the model uses relative position encoding (RoPE), positions exceeding derived_max_model_len lead to nan. If the model uses absolute position encoding, positions exceeding derived_max_model_len will cause a CUDA array out-of-bounds error. +(APIServer pid=1) INFO 04-15 22:38:46 [model.py:1678] Using max model len 1048576 +(APIServer pid=1) INFO 04-15 22:38:46 [cache.py:227] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor. 
+(APIServer pid=1) INFO 04-15 22:38:46 [scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-15 22:38:46 [config.py:281] Setting attention block size to 1056 tokens to ensure that attention page size is >= mamba page size. +(APIServer pid=1) INFO 04-15 22:38:46 [config.py:312] Padding mamba page size by 0.19% to ensure that mamba page size and attention page size are exactly equal. +(APIServer pid=1) WARNING 04-15 22:38:46 [modelopt.py:381] Detected ModelOpt fp8 checkpoint (quant_algo=FP8). Please note that the format is experimental and could change. +(APIServer pid=1) WARNING 04-15 22:38:46 [modelopt.py:998] Detected ModelOpt NVFP4 checkpoint. Please note that the format is experimental and could change in future. +(APIServer pid=1) INFO 04-15 22:38:46 [vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) WARNING 04-15 22:38:46 [vllm.py:848] Enforce eager set, disabling torch.compile and CUDAGraphs. This is equivalent to setting -cc.mode=none -cc.cudagraph_mode=none +(APIServer pid=1) WARNING 04-15 22:38:46 [vllm.py:859] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored. 
+(APIServer pid=1) INFO 04-15 22:38:46 [vllm.py:1025] Cudagraph is disabled under eager mode +(APIServer pid=1) INFO 04-15 22:38:51 [compilation.py:290] Enabled custom fusions: norm_quant, act_quant, allreduce_rms +(EngineCore pid=277) INFO 04-15 22:38:58 [core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', speculative_config=None, tokenizer='nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=1048576, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=True, quantization=modelopt_mixed, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=fp8_e4m3, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='super_v3', reasoning_parser_plugin='/opt/super_v3_reasoning_parser.py', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 
'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': False, 'alignment_asserts': False, 'scalar_asserts': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': True, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=277) INFO 04-15 22:38:58 [multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.244.248.111 (local), world_size=8, local_world_size=8 +(Worker pid=348) INFO 04-15 22:39:03 [parallel_state.py:1400] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl +(Worker pid=415) INFO 04-15 22:39:07 [parallel_state.py:1400] world_size=8 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl +(Worker pid=483) INFO 04-15 22:39:11 [parallel_state.py:1400] world_size=8 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl +(Worker pid=556) INFO 04-15 22:39:15 [parallel_state.py:1400] world_size=8 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl +(Worker pid=629) INFO 04-15 22:39:19 [parallel_state.py:1400] world_size=8 rank=4 local_rank=4 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl +(Worker pid=702) INFO 04-15 22:39:23 [parallel_state.py:1400] world_size=8 rank=5 local_rank=5 
distributed_init_method=tcp://127.0.0.1:36625 backend=nccl +(Worker pid=775) INFO 04-15 22:39:27 [parallel_state.py:1400] world_size=8 rank=6 local_rank=6 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl +(Worker pid=848) INFO 04-15 22:39:31 [parallel_state.py:1400] world_size=8 rank=7 local_rank=7 distributed_init_method=tcp://127.0.0.1:36625 backend=nccl +(Worker pid=348) INFO 04-15 22:39:31 [pynccl.py:111] vLLM is using nccl==2.28.9 +(Worker pid=348) INFO 04-15 22:39:36 [parallel_state.py:1716] rank 0 in world size 8 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(Worker_TP0 pid=348) INFO 04-15 22:39:37 [gpu_model_runner.py:4735] Starting to load model nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4... +(Worker_TP0 pid=348) INFO 04-15 22:39:38 [__init__.py:261] Selected FlashInferFP8ScaledMMLinearKernel for ModelOptFp8LinearMethod +(Worker_TP0 pid=348) INFO 04-15 22:39:38 [deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. +(Worker_TP0 pid=348) INFO 04-15 22:39:38 [nvfp4_utils.py:85] Using NvFp4LinearBackend.FLASHINFER_CUTLASS for NVFP4 GEMM +(Worker_TP0 pid=348) INFO 04-15 22:39:38 [nvfp4.py:256] Using 'FLASHINFER_TRTLLM' NvFp4 MoE backend out of potential backends: ['FLASHINFER_TRTLLM', 'FLASHINFER_CUTEDSL', 'FLASHINFER_CUTLASS', 'VLLM_CUTLASS', 'MARLIN']. +(Worker_TP1 pid=415) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend. +(Worker_TP0 pid=348) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend. +(Worker_TP4 pid=629) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend. +(Worker_TP2 pid=483) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend. +(Worker_TP6 pid=775) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend. +(Worker_TP7 pid=848) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend. 
+(Worker_TP3 pid=556) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend. +(Worker_TP5 pid=702) INFO 04-15 22:39:38 [cuda.py:274] Using AttentionBackendEnum.TRITON_ATTN backend. +(Worker_TP1 pid=415) INFO 04-15 22:41:36 [weight_utils.py:581] Time spent downloading weights for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4: 116.048954 seconds +(Worker_TP0 pid=348) +Loading safetensors checkpoint shards: 0% Completed | 0/17 [00:00 +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in 
run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args +(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config +(APIServer pid=1) return cls( +(APIServer pid=1) ^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ +(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client +(APIServer pid=1) return AsyncMPClient(*client_args) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper 
+(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ +(APIServer pid=1) super().__init__( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ +(APIServer pid=1) with launch_core_engines( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ +(APIServer pid=1) next(self.gen) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines +(APIServer pid=1) wait_for_engine_startup( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup +(APIServer pid=1) raise RuntimeError( +(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} +/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 8 leaked shared_memory objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' \ No newline at end of file diff --git a/lmcache_connector.py b/lmcache_connector.py new file mode 100644 index 0000000..8161c61 --- /dev/null +++ b/lmcache_connector.py @@ -0,0 +1,379 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any + +import torch + +from vllm.config import VllmConfig +from vllm.distributed.kv_events import ( + BlockStored, + KVCacheEvent, + KVConnectorKVEvents, + KVEventAggregator, +) +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, + KVConnectorMetadata, + KVConnectorRole, + SupportsHMA, +) +from vllm.logger import init_logger +from 
from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import KVConnectorOutput

if TYPE_CHECKING:
    from vllm.forward_context import ForwardContext
    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
    from vllm.v1.kv_cache_interface import KVCacheConfig
    from vllm.v1.request import Request

logger = init_logger(__name__)


class LMCacheKVEvents(KVConnectorKVEvents):
    """
    Concrete implementation of KVConnectorKVEvents backed by a
    KVEventAggregator, which collects per-worker KV cache events and
    can reduce them to the set common to all workers.
    """

    def __init__(self, num_workers: int) -> None:
        self._aggregator = KVEventAggregator(num_workers)

    def add_events(self, events: list[KVCacheEvent]) -> None:
        """Append a batch of KV cache events to the aggregator."""
        self._aggregator.add_events(events)

    def aggregate(self) -> "LMCacheKVEvents":
        """
        Aggregate KV events and retain only events common to all workers.

        Returns:
            self, with the aggregator reset to hold only the common events.
        """
        common_events = self._aggregator.get_common_events()
        self._aggregator.clear_events()
        self._aggregator.add_events(common_events)
        self._aggregator.reset_workers()
        return self

    def increment_workers(self, count: int = 1) -> None:
        """Increase the number of workers contributing events."""
        self._aggregator.increment_workers(count)

    def get_all_events(self) -> list[KVCacheEvent]:
        """Return every event currently held by the aggregator."""
        return self._aggregator.get_all_events()

    def get_number_of_workers(self) -> int:
        """Return the number of workers the aggregator is tracking."""
        return self._aggregator.get_number_of_workers()

    def clear_events(self) -> None:
        """Drop all buffered events and reset the worker count."""
        self._aggregator.clear_events()
        self._aggregator.reset_workers()

    def __repr__(self) -> str:
        # Fix: the original body was `return f""` — an empty f-string that
        # rendered nothing in logs/debuggers (the repr content was evidently
        # lost). Emit a conventional, informative repr instead.
        return (
            f"<{type(self).__name__} "
            f"num_events={len(self.get_all_events())}, "
            f"num_workers={self.get_number_of_workers()}>"
        )


class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA):
    """
    vLLM v1 KV connector that delegates KV cache offloading to an LMCache
    engine implementation. Declares SupportsHMA so hybrid Mamba/Attention
    models can enable this connector; only the attention (first) KV cache
    group is handed to LMCache (see request_finished_all_groups).
    """

    @classmethod
    def requires_piecewise_for_cudagraph(cls, extra_config: dict[str, Any]) -> bool:
        """
        LMCache requires PIECEWISE CUDA graph mode when layerwise
        operations are enabled. The wait_for_layer_load and save_kv_layer
        methods perform actual async synchronization that cannot be
        captured in CUDA graphs.
        """
        return extra_config.get("use_layerwise", False)

    def __init__(
        self,
        vllm_config: "VllmConfig",
        role: KVConnectorRole,
        kv_cache_config: "KVCacheConfig",
    ):
        super().__init__(
            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
        )
        assert vllm_config.kv_transfer_config is not None
        use_native = vllm_config.kv_transfer_config.get_from_extra_config(
            "use_native", False
        )
        # Renamed local from `cls` to `impl_cls`: `cls` conventionally refers
        # to the class object in classmethods and shadowing it here was
        # misleading.
        if use_native:
            logger.info("Initializing native LMCache connector")
            # lazy import: only pull in the in-tree integration when requested
            from vllm.distributed.kv_transfer.kv_connector.v1 import lmcache_integration

            _adapter = lmcache_integration.vllm_v1_adapter

            impl_cls = _adapter.LMCacheConnectorV1Impl
        else:
            logger.info("Initializing latest dev LMCache connector")
            # lazy import: the external lmcache package may not be installed
            # unless this path is taken
            from lmcache.integration.vllm.vllm_v1_adapter import (
                LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
            )

            impl_cls = LMCacheConnectorLatestImpl

        self._lmcache_engine = impl_cls(vllm_config, role, self)

        # Accumulates worker-side KV events between scheduler take_events calls.
        self._kv_cache_events: LMCacheKVEvents | None = None

    # ==============================
    # Worker-side methods
    # ==============================
    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
        """
        Initialize with the KV caches. Useful for pre-registering the
        KV Caches in the KVConnector (e.g. for NIXL).

        Args:
            kv_caches: dictionary of layer names, kv cache
        """
        if hasattr(self._lmcache_engine, "register_kv_caches"):
            self._lmcache_engine.register_kv_caches(kv_caches)
        else:
            # Best-effort: older engine versions lack this hook; warn rather
            # than fail so startup can proceed.
            logger.warning(
                "LMCache engine does not support register_kv_caches, "
                "please check and use the latest version"
            )

    def start_load_kv(self, forward_context: "ForwardContext", **kwargs: Any) -> None:
        """
        Start loading the KV cache from the connector to vLLM's paged
        KV buffer. This is called from the forward context before the
        forward pass to enable async loading during model execution.

        Args:
            forward_context (ForwardContext): the forward context.
            **kwargs: additional arguments for the load operation

        Note:
            The number of elements in kv_caches and layer_names should be
            the same.
        """
        self._lmcache_engine.start_load_kv(forward_context, **kwargs)

    def wait_for_layer_load(self, layer_name: str) -> None:
        """
        Block until the KV for a specific layer is loaded into vLLM's
        paged buffer. This is called from within attention layer to ensure
        async copying from start_load_kv is complete.

        This interface will be useful for layer-by-layer pipelining.

        Args:
            layer_name: the name of that layer
        """
        self._lmcache_engine.wait_for_layer_load(layer_name)

    def save_kv_layer(
        self,
        layer_name: str,
        kv_layer: torch.Tensor,
        attn_metadata: AttentionMetadata,
        **kwargs: Any,
    ) -> None:
        """
        Start saving a layer of KV cache from vLLM's paged buffer
        to the connector. This is called from within attention layer to
        enable async copying during execution.

        Args:
            layer_name (str): the name of the layer.
            kv_layer (torch.Tensor): the paged KV buffer of the current
                layer in vLLM.
            attn_metadata (AttentionMetadata): the attention metadata.
            **kwargs: additional arguments for the save operation.
        """
        self._lmcache_engine.save_kv_layer(
            layer_name, kv_layer, attn_metadata, **kwargs
        )

    def wait_for_save(self):
        """
        Block until all the save operations are done. This is called
        as the forward context exits to ensure that the async saving
        from save_kv_layer is complete before finishing the forward.

        This prevents overwrites of paged KV buffer before saving done.
        """
        self._lmcache_engine.wait_for_save()

    def get_finished(
        self, finished_req_ids: set[str]
    ) -> tuple[set[str] | None, set[str] | None]:
        """
        Notifies worker-side connector ids of requests that have
        finished generating tokens.

        Returns:
            ids of requests that have finished asynchronous transfer
            (requests that previously returned True from request_finished()),
            tuple of (sending/saving ids, recving/loading ids).
            The finished saves/sends req ids must belong to a set provided in a
            call to this method (this call or a prior one).
        """
        return self._lmcache_engine.get_finished(finished_req_ids)

    def get_block_ids_with_load_errors(self) -> set[int]:
        """
        Get the set of block IDs that failed to load.

        Returns:
            Set of block IDs that encountered load errors.
            Empty set if no load errors occurred.
        """
        method = getattr(self._lmcache_engine, "get_block_ids_with_load_errors", None)
        if callable(method):
            return method()

        # Fallback for older versions that don't support this method
        return set()

    def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None:
        """
        Get the KV connector kv cache events collected during the last interval.

        Returns:
            An LMCacheKVEvents holding the engine's BlockStored events, or
            None if the engine reported no events.
        """
        events = self._lmcache_engine.get_kv_events()  # type: ignore [attr-defined]
        if not events:
            return None

        blocks: list[BlockStored] = [
            BlockStored(
                block_hashes=e.block_hashes,
                parent_block_hash=e.parent_block_hash,
                token_ids=e.token_ids,
                lora_id=e.lora_id,
                block_size=e.block_size,
                medium=e.medium,
                # lora_name may be absent on older engine event objects
                lora_name=getattr(e, "lora_name", None),
            )
            for e in events
        ]

        lmcache_kv_events = LMCacheKVEvents(num_workers=1)
        lmcache_kv_events.add_events(blocks)
        return lmcache_kv_events

    # ==============================
    # Scheduler-side methods
    # ==============================
    def get_num_new_matched_tokens(
        self,
        request: "Request",
        num_computed_tokens: int,
    ) -> tuple[int | None, bool]:
        """
        Get number of new tokens that can be loaded from the
        external KV cache beyond the num_computed_tokens.

        Args:
            request (Request): the request object.
            num_computed_tokens (int): the number of locally
                computed tokens for this request

        Returns:
            the number of tokens that can be loaded from the
            external KV cache beyond what is already computed,
            and False (loading is never asynchronous here).
        """
        return (
            self._lmcache_engine.get_num_new_matched_tokens(
                request, num_computed_tokens
            ),
            False,
        )

    def update_state_after_alloc(
        self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
    ):
        """
        Update KVConnector state after block allocation.
        """
        # NOTE: the engine-side API does not take the allocated blocks.
        self._lmcache_engine.update_state_after_alloc(request, num_external_tokens)

    def build_connector_meta(
        self, scheduler_output: SchedulerOutput
    ) -> KVConnectorMetadata:
        """
        Build the connector metadata for this step.

        This function should NOT modify fields in the scheduler_output.
        Also, calling this function will reset the state of the connector.

        Args:
            scheduler_output (SchedulerOutput): the scheduler output object.
        """
        return self._lmcache_engine.build_connector_meta(scheduler_output)

    def update_connector_output(self, connector_output: KVConnectorOutput):
        """
        Update KVConnector state from worker-side connectors output.

        Args:
            connector_output (KVConnectorOutput): the worker-side
                connectors output.
        """
        # Only fold in events of our own type; anything else is ignored.
        kv_cache_events = connector_output.kv_cache_events
        if not kv_cache_events or not isinstance(kv_cache_events, LMCacheKVEvents):
            return

        if self._kv_cache_events is None:
            self._kv_cache_events = kv_cache_events
        else:
            self._kv_cache_events.add_events(kv_cache_events.get_all_events())
            self._kv_cache_events.increment_workers(
                kv_cache_events.get_number_of_workers()
            )

    def request_finished(
        self,
        request: "Request",
        block_ids: list[int],
    ) -> tuple[bool, dict[str, Any] | None]:
        """
        Called when a request has finished, before its blocks are freed.

        Returns:
            True if the request is being saved/sent asynchronously and blocks
            should not be freed until the request_id is returned from
            get_finished().
            Optional KVTransferParams to be included in the request outputs
            returned by the engine.
        """
        return self._lmcache_engine.request_finished(request, block_ids)

    def request_finished_all_groups(
        self,
        request: "Request",
        block_ids: tuple[list[int], ...],
    ) -> tuple[bool, dict[str, Any] | None]:
        """
        Called exactly once when a request has finished for all KV cache
        groups (HMA support for hybrid Mamba/Attention models).

        LMCache only stores/offloads attention KV cache blocks, so we
        extract the first group's block IDs and delegate to the
        single-group request_finished.

        Args:
            request: the request object.
            block_ids: tuple of block ID lists, one per KV cache group.

        Returns:
            Same as request_finished.
        """
        # LMCache only handles attention (first) group blocks.
        # Mamba SSM state is managed separately by the scheduler.
        return self.request_finished(request, block_ids[0])

    def take_events(self) -> Iterable["KVCacheEvent"]:
        """
        Take the KV cache events from the connector.

        Yields:
            New KV cache events since the last call.
        """
        if self._kv_cache_events is not None:
            # Reduce to events common to all workers before handing them out,
            # then drop the buffer so the next interval starts fresh.
            self._kv_cache_events.aggregate()
            kv_cache_events = self._kv_cache_events.get_all_events()
            yield from kv_cache_events
            self._kv_cache_events.clear_events()
            self._kv_cache_events = None