# Source: vllm-with-lmcache/THIS_IS_MY_POD_SUMMARY_VINNY.md (338 lines, 10 KiB)
# Pasted pod dump; non-YAML file-browser chrome removed and kept as this comment.
---
# Kubernetes Pod object (apiserver dump) for the vLLM + LMCache serving engine.
apiVersion: v1
kind: Pod
metadata:
  annotations:
    cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
    cni.projectcalico.org/podIP: 10.244.248.111/32
    cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
    # Multus-reported attachment results: default Calico network plus the
    # secondary InfiniBand (IPoIB) interface "net1" used for KV transfer.
    k8s.v1.cni.cncf.io/network-status: |-
      [{
          "name": "k8s-pod-network",
          "ips": [
              "10.244.248.111",
              "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
          ],
          "default": true,
          "dns": {}
      },{
          "name": "vllm/ipoib-network-vllm",
          "interface": "net1",
          "ips": [
              "10.66.0.6"
          ],
          "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
          "dns": {}
      }]
    k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
    # NOTE(review): "networks-status" is the deprecated alias of
    # "network-status"; Multus writes both, so the duplicate payload is kept.
    k8s.v1.cni.cncf.io/networks-status: |-
      [{
          "name": "k8s-pod-network",
          "ips": [
              "10.244.248.111",
              "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
          ],
          "default": true,
          "dns": {}
      },{
          "name": "vllm/ipoib-network-vllm",
          "interface": "net1",
          "ips": [
              "10.66.0.6"
          ],
          "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
          "dns": {}
      }]
  creationTimestamp: '2026-04-15T22:38:27Z'
  generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
  generation: 1
  labels:
    app.kubernetes.io/component: serving-engine
    app.kubernetes.io/instance: production-stack-sea-inference
    app.kubernetes.io/managed-by: helm
    app.kubernetes.io/name: nemotron-3-super
    app.kubernetes.io/part-of: vllm-stack
    environment: test
    helm-release-name: production-stack-sea-inference
    model: nemotron-3-super
    pod-template-hash: 856dc7d695
    release: test
    topology.kubernetes.io/region: sea
  name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
  namespace: vllm
  ownerReferences:
    - apiVersion: apps/v1
      blockOwnerDeletion: true
      controller: true
      kind: ReplicaSet
      name: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
      uid: 88c04723-f29b-432a-8318-21a9d389cac4
  resourceVersion: '29767269'
  uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
spec:
  containers:
    # Single serving container: `vllm serve` with tensor parallelism across
    # all 8 GPUs and the LMCache KV connector for remote/CPU KV caching.
    - command:
        - vllm
        - serve
        - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
        - '--host'
        - 0.0.0.0
        - '--port'
        - '8000'
        - '--no-enable-prefix-caching'
        - '--tensor-parallel-size'
        - '8'
        - '--async-scheduling'
        - '--dtype=auto'
        - '--attention-backend=TRITON_ATTN'
        - '--gpu_memory_utilization=0.96'
        - '--enable-auto-tool-choice'
        - '--tool-call-parser=qwen3_coder'
        - '--trust_remote_code'
        - '--max-cudagraph-capture-size=128'
        - '--enable-chunked-prefill'
        - '--mamba-ssm-cache-dtype=float16'
        - '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
        - '--reasoning-parser=super_v3'
        - '--max-model-len=1048576'
        - '--disable-custom-all-reduce'
        - '--no-disable-hybrid-kv-cache-manager'
        - '--enforce-eager'
        - '--kv-transfer-config'
        - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
      env:
        - name: PYTHONHASHSEED
          value: '123'
        - name: HF_HOME
          value: /tmp
        - name: POD_IP
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: status.podIP
        - name: PROMETHEUS_MULTIPROC_DIR
          value: /tmp
        - name: OMP_NUM_THREADS
          value: '32'
        # SECURITY(review): plaintext Hugging Face token embedded in the pod
        # spec — readable by anyone with `get pod` access to this namespace.
        # Move it into a Secret (valueFrom.secretKeyRef) and revoke this token.
        - name: HF_TOKEN
          value: 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
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
          value: '1'
        - name: NCCL_TOPO_FILE
          value: /etc/nccl/virtualTopology.xml
        - name: PYTORCH_CUDA_ALLOC_CONF
          value: expandable_segments:True
        - name: LMCACHE_REMOTE_URL
          value: redis://10.66.0.100:6379
        - name: LMCACHE_REMOTE_SERDE
          value: naive
        - name: LMCACHE_USE_EXPERIMENTAL
          value: 'True'
        - name: VLLM_RPC_TIMEOUT
          value: '1000000'
        - name: LMCACHE_LOG_LEVEL
          value: ERROR
        - name: LMCACHE_LOCAL_CPU
          value: 'True'
        - name: LMCACHE_MAX_LOCAL_CPU_SIZE
          value: '512'
        # Instance id is derived from the pod name via the downward API.
        - name: LMCACHE_LMCACHE_INSTANCE_ID
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
      image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
      imagePullPolicy: Always
      livenessProbe:
        failureThreshold: 3
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 15
        periodSeconds: 10
        successThreshold: 1
        timeoutSeconds: 1
      name: vllm
      ports:
        - containerPort: 8000
          name: container-port
          protocol: TCP
        - containerPort: 55555
          name: zmq-port
          protocol: TCP
        - containerPort: 9999
          name: ucx-port
          protocol: TCP
      readinessProbe:
        failureThreshold: 3
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 15
        periodSeconds: 5
        successThreshold: 1
        timeoutSeconds: 1
      resources:
        limits:
          memory: 1500Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
        requests:
          cpu: '8'
          memory: 16Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
      securityContext:
        runAsNonRoot: false
      # Startup probe allows up to 120 * 60s = 2h for model load before the
      # liveness probe takes over.
      startupProbe:
        failureThreshold: 120
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 30
        periodSeconds: 60
        successThreshold: 1
        timeoutSeconds: 1
      terminationMessagePath: /dev/termination-log
      terminationMessagePolicy: File
      volumeMounts:
        - mountPath: /dev/shm
          name: shm
        - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
          name: kube-api-access-dlhrd
          readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
  nodeName: b200-nodepool-d51376abbf32
  preemptionPolicy: PreemptLowerPriority
  priority: 0
  restartPolicy: Always
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
  terminationGracePeriodSeconds: 30
  tolerations:
    - effect: NoExecute
      key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
    - effect: NoExecute
      key: node.kubernetes.io/unreachable
      operator: Exists
      tolerationSeconds: 300
  volumes:
    # tmpfs-backed /dev/shm for inter-process tensor sharing (64Gi cap).
    - emptyDir:
        medium: Memory
        sizeLimit: 64Gi
      name: shm
    - name: kube-api-access-dlhrd
      projected:
        defaultMode: 420
        sources:
          - serviceAccountToken:
              expirationSeconds: 3607
              path: token
          - configMap:
              items:
                - key: ca.crt
                  path: ca.crt
              name: kube-root-ca.crt
          - downwardAPI:
              items:
                - fieldRef:
                    apiVersion: v1
                    fieldPath: metadata.namespace
                  path: namespace
status:
  conditions:
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:32Z'
      observedGeneration: 1
      status: 'True'
      type: PodReadyToStartContainers
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      observedGeneration: 1
      status: 'True'
      type: Initialized
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      message: 'containers with unready status: [vllm]'
      observedGeneration: 1
      reason: ContainersNotReady
      status: 'False'
      type: Ready
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      message: 'containers with unready status: [vllm]'
      observedGeneration: 1
      reason: ContainersNotReady
      status: 'False'
      type: ContainersReady
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      observedGeneration: 1
      status: 'True'
      type: PodScheduled
  containerStatuses:
    - allocatedResources:
        cpu: '8'
        memory: 16Gi
        nvidia.com/gpu: '8'
        rdma/ib: '1'
      containerID: >-
        containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
      image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
      imageID: >-
        atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
      # NOTE(review): the previous run exited with code 1 ~4 minutes after
      # start (restartCount: 1) — inspect `kubectl logs --previous` for the
      # startup failure before trusting the current Running state.
      lastState:
        terminated:
          containerID: >-
            containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
          exitCode: 1
          finishedAt: '2026-04-15T22:42:20Z'
          reason: Error
          startedAt: '2026-04-15T22:38:31Z'
      name: vllm
      ready: false
      resources:
        limits:
          memory: 1500Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
        requests:
          cpu: '8'
          memory: 16Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
      restartCount: 1
      started: false
      state:
        running:
          startedAt: '2026-04-15T22:42:24Z'
      volumeMounts:
        - mountPath: /dev/shm
          name: shm
        - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
          name: kube-api-access-dlhrd
          readOnly: true
          recursiveReadOnly: Disabled
  hostIP: 10.4.96.13
  hostIPs:
    - ip: 10.4.96.13
    - ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
  observedGeneration: 1
  phase: Running
  podIP: 10.244.248.111
  podIPs:
    - ip: 10.244.248.111
    - ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
  qosClass: Burstable
  startTime: '2026-04-15T22:38:27Z'