File: vllm-with-lmcache/THIS_IS_MY_POD_SUMMARY_VINNY.md (10 KiB)

Captured Pod manifest for the vLLM + LMCache serving engine (namespace: vllm).
# Captured Pod manifest (kubectl get pod -o yaml) for the vLLM + LMCache
# serving engine running NVIDIA Nemotron-3-Super on 8 GPUs.
#
# SECURITY NOTE(review): the original dump contained a live Hugging Face
# access token in the HF_TOKEN env var, committed in plaintext. It is
# redacted below. Rotate the leaked token immediately and inject it via a
# Kubernetes Secret (valueFrom.secretKeyRef) instead of a literal value.
apiVersion: v1
kind: Pod
metadata:
  annotations:
    cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
    cni.projectcalico.org/podIP: 10.244.248.111/32
    cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
    # Multus/CNI status: default Calico network plus a secondary IPoIB
    # (InfiniBand) attachment on net1 at 10.66.0.6.
    k8s.v1.cni.cncf.io/network-status: |-
      [{
          "name": "k8s-pod-network",
          "ips": [
              "10.244.248.111",
              "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
          ],
          "default": true,
          "dns": {}
      },{
          "name": "vllm/ipoib-network-vllm",
          "interface": "net1",
          "ips": [
              "10.66.0.6"
          ],
          "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
          "dns": {}
      }]
    k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
    # Deprecated duplicate of network-status kept by older Multus versions.
    k8s.v1.cni.cncf.io/networks-status: |-
      [{
          "name": "k8s-pod-network",
          "ips": [
              "10.244.248.111",
              "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
          ],
          "default": true,
          "dns": {}
      },{
          "name": "vllm/ipoib-network-vllm",
          "interface": "net1",
          "ips": [
              "10.66.0.6"
          ],
          "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
          "dns": {}
      }]
  creationTimestamp: '2026-04-15T22:38:27Z'
  generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
  generation: 1
  labels:
    app.kubernetes.io/component: serving-engine
    app.kubernetes.io/instance: production-stack-sea-inference
    app.kubernetes.io/managed-by: helm
    app.kubernetes.io/name: nemotron-3-super
    app.kubernetes.io/part-of: vllm-stack
    environment: test
    helm-release-name: production-stack-sea-inference
    model: nemotron-3-super
    pod-template-hash: 856dc7d695
    release: test
    topology.kubernetes.io/region: sea
  name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
  namespace: vllm
  ownerReferences:
    - apiVersion: apps/v1
      blockOwnerDeletion: true
      controller: true
      kind: ReplicaSet
      name: >-
        production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
      uid: 88c04723-f29b-432a-8318-21a9d389cac4
  resourceVersion: '29767269'
  uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
spec:
  containers:
    - command:
        - vllm
        - serve
        - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
        - '--host'
        - 0.0.0.0
        - '--port'
        - '8000'
        - '--no-enable-prefix-caching'
        - '--tensor-parallel-size'
        - '8'
        - '--async-scheduling'
        - '--dtype=auto'
        - '--attention-backend=TRITON_ATTN'
        - '--gpu_memory_utilization=0.96'
        - '--enable-auto-tool-choice'
        - '--tool-call-parser=qwen3_coder'
        - '--trust_remote_code'
        - '--max-cudagraph-capture-size=128'
        - '--enable-chunked-prefill'
        - '--mamba-ssm-cache-dtype=float16'
        - '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
        - '--reasoning-parser=super_v3'
        - '--max-model-len=1048576'
        - '--disable-custom-all-reduce'
        - '--no-disable-hybrid-kv-cache-manager'
        - '--enforce-eager'
        - '--kv-transfer-config'
        - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
      env:
        - name: PYTHONHASHSEED
          value: '123'
        - name: HF_HOME
          value: /tmp
        - name: POD_IP
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: status.podIP
        - name: PROMETHEUS_MULTIPROC_DIR
          value: /tmp
        - name: OMP_NUM_THREADS
          value: '32'
        # NOTE(review): token redacted — the captured manifest embedded a
        # live hf_... token here. Rotate it and use secretKeyRef instead.
        - name: HF_TOKEN
          value: '<REDACTED>'
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
          value: '1'
        - name: NCCL_TOPO_FILE
          value: /etc/nccl/virtualTopology.xml
        - name: PYTORCH_CUDA_ALLOC_CONF
          value: expandable_segments:True
        - name: LMCACHE_REMOTE_URL
          value: redis://10.66.0.100:6379
        - name: LMCACHE_REMOTE_SERDE
          value: naive
        - name: LMCACHE_USE_EXPERIMENTAL
          value: 'True'
        - name: VLLM_RPC_TIMEOUT
          value: '1000000'
        - name: LMCACHE_LOG_LEVEL
          value: ERROR
        - name: LMCACHE_LOCAL_CPU
          value: 'True'
        - name: LMCACHE_MAX_LOCAL_CPU_SIZE
          value: '512'
        - name: LMCACHE_LMCACHE_INSTANCE_ID
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
      image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
      imagePullPolicy: Always
      livenessProbe:
        failureThreshold: 3
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 15
        periodSeconds: 10
        successThreshold: 1
        timeoutSeconds: 1
      name: vllm
      ports:
        - containerPort: 8000
          name: container-port
          protocol: TCP
        - containerPort: 55555
          name: zmq-port
          protocol: TCP
        - containerPort: 9999
          name: ucx-port
          protocol: TCP
      readinessProbe:
        failureThreshold: 3
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 15
        periodSeconds: 5
        successThreshold: 1
        timeoutSeconds: 1
      # NOTE(review): memory request (16Gi) is far below the limit (1500Gi),
      # which is what makes this pod Burstable QoS (see status.qosClass).
      resources:
        limits:
          memory: 1500Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
        requests:
          cpu: '8'
          memory: 16Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
      securityContext:
        runAsNonRoot: false
      startupProbe:
        failureThreshold: 120
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 30
        periodSeconds: 60
        successThreshold: 1
        timeoutSeconds: 1
      terminationMessagePath: /dev/termination-log
      terminationMessagePolicy: File
      volumeMounts:
        - mountPath: /dev/shm
          name: shm
        - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
          name: kube-api-access-dlhrd
          readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
  nodeName: b200-nodepool-d51376abbf32
  preemptionPolicy: PreemptLowerPriority
  priority: 0
  restartPolicy: Always
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
  terminationGracePeriodSeconds: 30
  tolerations:
    - effect: NoExecute
      key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
    - effect: NoExecute
      key: node.kubernetes.io/unreachable
      operator: Exists
      tolerationSeconds: 300
  volumes:
    # tmpfs-backed /dev/shm for NCCL/torch shared memory.
    - emptyDir:
        medium: Memory
        sizeLimit: 64Gi
      name: shm
    - name: kube-api-access-dlhrd
      projected:
        defaultMode: 420
        sources:
          - serviceAccountToken:
              expirationSeconds: 3607
              path: token
          - configMap:
              items:
                - key: ca.crt
                  path: ca.crt
              name: kube-root-ca.crt
          - downwardAPI:
              items:
                - fieldRef:
                    apiVersion: v1
                    fieldPath: metadata.namespace
                  path: namespace
status:
  conditions:
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:32Z'
      observedGeneration: 1
      status: 'True'
      type: PodReadyToStartContainers
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      observedGeneration: 1
      status: 'True'
      type: Initialized
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      message: 'containers with unready status: [vllm]'
      observedGeneration: 1
      reason: ContainersNotReady
      status: 'False'
      type: Ready
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      message: 'containers with unready status: [vllm]'
      observedGeneration: 1
      reason: ContainersNotReady
      status: 'False'
      type: ContainersReady
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      observedGeneration: 1
      status: 'True'
      type: PodScheduled
  containerStatuses:
    - allocatedResources:
        cpu: '8'
        memory: 16Gi
        nvidia.com/gpu: '8'
        rdma/ib: '1'
      containerID: >-
        containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
      image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
      imageID: >-
        atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
      # NOTE(review): the previous container instance exited with code 1
      # (~4 minutes after start) and restartCount is 1 — the current run is
      # the second attempt and was not yet Ready at capture time.
      lastState:
        terminated:
          containerID: >-
            containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
          exitCode: 1
          finishedAt: '2026-04-15T22:42:20Z'
          reason: Error
          startedAt: '2026-04-15T22:38:31Z'
      name: vllm
      ready: false
      resources:
        limits:
          memory: 1500Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
        requests:
          cpu: '8'
          memory: 16Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
      restartCount: 1
      started: false
      state:
        running:
          startedAt: '2026-04-15T22:42:24Z'
      volumeMounts:
        - mountPath: /dev/shm
          name: shm
        - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
          name: kube-api-access-dlhrd
          readOnly: true
          recursiveReadOnly: Disabled
  hostIP: 10.4.96.13
  hostIPs:
    - ip: 10.4.96.13
    - ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
  observedGeneration: 1
  phase: Running
  podIP: 10.244.248.111
  podIPs:
    - ip: 10.244.248.111
    - ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
  qosClass: Burstable
  startTime: '2026-04-15T22:38:27Z'