THIS_IS_MY_POD_SUMMARY_VINNY.md

# Pod object including server-populated fields (uid, resourceVersion, status).
# NOTE(review): presumably captured from the API server rather than written by
# hand — confirm before re-applying it as a manifest.
apiVersion: v1
kind: Pod
metadata:
  # CNI-written annotations. The two *-status annotations below carry the same
  # JSON: the default "k8s-pod-network" attachment (dual-stack pod IPs) and a
  # secondary "vllm/ipoib-network-vllm" attachment on interface net1
  # (10.66.0.6). The net1 "mac" value is 20 octets long, not a 6-octet
  # Ethernet MAC.
  # NOTE(review): "networks-status" looks like a legacy duplicate of
  # "network-status" — confirm which one consumers actually read.
  annotations:
    cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
    cni.projectcalico.org/podIP: 10.244.248.111/32
    cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
    k8s.v1.cni.cncf.io/network-status: |-
      [{
          "name": "k8s-pod-network",
          "ips": [
              "10.244.248.111",
              "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
          ],
          "default": true,
          "dns": {}
      },{
          "name": "vllm/ipoib-network-vllm",
          "interface": "net1",
          "ips": [
              "10.66.0.6"
          ],
          "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
          "dns": {}
      }]
    k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
    k8s.v1.cni.cncf.io/networks-status: |-
      [{
          "name": "k8s-pod-network",
          "ips": [
              "10.244.248.111",
              "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
          ],
          "default": true,
          "dns": {}
      },{
          "name": "vllm/ipoib-network-vllm",
          "interface": "net1",
          "ips": [
              "10.66.0.6"
          ],
          "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
          "dns": {}
      }]
  creationTimestamp: '2026-04-15T22:38:27Z'
  generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
  generation: 1
  # NOTE(review): environment/release are "test" while the instance and Helm
  # release names begin with "production-stack" — presumably that is the chart
  # name rather than the environment; confirm.
  labels:
    app.kubernetes.io/component: serving-engine
    app.kubernetes.io/instance: production-stack-sea-inference
    app.kubernetes.io/managed-by: helm
    app.kubernetes.io/name: nemotron-3-super
    app.kubernetes.io/part-of: vllm-stack
    environment: test
    helm-release-name: production-stack-sea-inference
    model: nemotron-3-super
    pod-template-hash: 856dc7d695
    release: test
    topology.kubernetes.io/region: sea
  # 63 characters: the first 58 characters of generateName followed by a
  # 5-character suffix, so the "-vllm-856dc7d695-" tail of the prefix is not
  # present in the final name.
  name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
  namespace: vllm
  # Controlled by a ReplicaSet (controller: true) and labelled as Helm-managed,
  # so spec changes belong in the owning release rather than in this Pod.
  ownerReferences:
    - apiVersion: apps/v1
      blockOwnerDeletion: true
      controller: true
      kind: ReplicaSet
      name: >-
        production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
      uid: 88c04723-f29b-432a-8318-21a9d389cac4
  resourceVersion: '29767269'
  uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
spec:
  containers:
    # argv of the single "vllm" container. Item order matters wherever a flag
    # and its value are separate list entries (--host, --port,
    # --tensor-parallel-size, --kv-transfer-config).
    # NOTE(review): flag spelling mixes dashes and underscores
    # (--gpu_memory_utilization, --trust_remote_code) — presumably both are
    # accepted by the CLI; confirm and consider normalising.
    - command:
        - vllm
        - serve
        - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
        - '--host'
        - 0.0.0.0
        - '--port'
        - '8000'
        - '--no-enable-prefix-caching'
        - '--tensor-parallel-size'
        # Matches the 8 GPUs requested under resources.
        - '8'
        - '--async-scheduling'
        - '--dtype=auto'
        - '--attention-backend=TRITON_ATTN'
        - '--gpu_memory_utilization=0.96'
        - '--enable-auto-tool-choice'
        - '--tool-call-parser=qwen3_coder'
        - '--trust_remote_code'
        # NOTE(review): --enforce-eager is also passed further down; if eager
        # mode disables CUDA graph capture, this capture-size setting has no
        # effect — confirm which of the two is intended.
        - '--max-cudagraph-capture-size=128'
        - '--enable-chunked-prefill'
        - '--mamba-ssm-cache-dtype=float16'
        # The plugin file lives under /opt; no volumeMount in this spec provides
        # that path, so it presumably ships inside the image — confirm.
        - '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
        - '--reasoning-parser=super_v3'
        # 1048576 = 2^20 tokens; VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 is set in env,
        # presumably to permit this length — confirm.
        - '--max-model-len=1048576'
        - '--disable-custom-all-reduce'
        - '--no-disable-hybrid-kv-cache-manager'
        - '--enforce-eager'
        # The JSON below is the value of --kv-transfer-config and must stay
        # directly after it.
        - '--kv-transfer-config'
        - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
      # Environment for the vllm container.
      # This Pod is controlled by a ReplicaSet and labelled as Helm-managed, so
      # apply changes here through the owning release, not on the Pod itself.
      env:
        - name: PYTHONHASHSEED
          value: '123'
        # NOTE(review): HF_HOME and PROMETHEUS_MULTIPROC_DIR both point at /tmp,
        # so both write into the same directory — confirm that is intended.
        - name: HF_HOME
          value: /tmp
        - name: POD_IP
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: status.podIP
        - name: PROMETHEUS_MULTIPROC_DIR
          value: /tmp
        # NOTE(review): 32 threads against a CPU request of 8 — confirm sizing.
        - name: OMP_NUM_THREADS
          value: '32'
        # The Hugging Face token was previously embedded here as a literal
        # value, which exposes it to anyone who can read the Pod or this file.
        # It is now read from a Secret. The Secret name and key below are
        # placeholders: create the Secret in the "vllm" namespace (or point
        # these at an existing one) before rollout, and rotate the token that
        # was exposed.
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token
              key: token
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
          value: '1'
        - name: NCCL_TOPO_FILE
          value: /etc/nccl/virtualTopology.xml
        - name: PYTORCH_CUDA_ALLOC_CONF
          value: expandable_segments:True
        # 10.66.0.100 is in the same 10.66.0.x range as this Pod's net1 address
        # (10.66.0.6), so this presumably goes over the secondary network.
        - name: LMCACHE_REMOTE_URL
          value: redis://10.66.0.100:6379
        - name: LMCACHE_REMOTE_SERDE
          value: naive
        - name: LMCACHE_USE_EXPERIMENTAL
          value: 'True'
        # NOTE(review): unit not visible here (presumably milliseconds) — confirm.
        - name: VLLM_RPC_TIMEOUT
          value: '1000000'
        - name: LMCACHE_LOG_LEVEL
          value: ERROR
        - name: LMCACHE_LOCAL_CPU
          value: 'True'
        # NOTE(review): unit not visible here (presumably GB) and it is unclear
        # whether this applies per worker — check against the memory limit.
        - name: LMCACHE_MAX_LOCAL_CPU_SIZE
          value: '512'
        # Uses the Pod name as the instance id, so it is unique per Pod.
        - name: LMCACHE_LMCACHE_INSTANCE_ID
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
      # Tag-based image reference, re-pulled on every container start.
      # NOTE(review): status.containerStatuses records the digest this tag
      # resolved to; consider pinning by digest for reproducible rollouts.
      image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
      imagePullPolicy: Always
      # Liveness check against GET /health on port 8000. Three consecutive
      # failures at a 10s period restart the container. The probe timeout is
      # raised from 1s to 5s so that a briefly slow /health reply from a busy
      # server does not trigger a restart: the startupProbe on this container
      # allows roughly two hours for start-up, so a false restart is costly.
      livenessProbe:
        failureThreshold: 3
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 15
        periodSeconds: 10
        successThreshold: 1
        timeoutSeconds: 5
      name: vllm
      # Declared container ports: the HTTP API plus two auxiliary TCP ports.
      ports:
        - name: container-port
          protocol: TCP
          containerPort: 8000
        - name: zmq-port
          protocol: TCP
          containerPort: 55555
        - name: ucx-port
          protocol: TCP
          containerPort: 9999
      # Readiness gate: GET /health on the API port every 5s after a 15s delay.
      # One success marks the container ready; three consecutive failures mark
      # it unready. Each attempt times out after 1s.
      readinessProbe:
        httpGet:
          scheme: HTTP
          port: 8000
          path: /health
        initialDelaySeconds: 15
        periodSeconds: 5
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      # Resource envelope for the vllm container: 8 GPUs and 1 rdma/ib device
      # (request == limit), 8 CPUs requested with no CPU limit, and memory
      # requested at 16Gi with a 1500Gi limit.
      # NOTE(review): requests below limits matches the Burstable qosClass in
      # status. Confirm that a 16Gi memory request reflects real usage, given
      # the 64Gi memory-backed /dev/shm volume and LMCACHE_MAX_LOCAL_CPU_SIZE=512
      # configured for this container.
      resources:
        limits:
          memory: 1500Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
        requests:
          cpu: '8'
          memory: 16Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
      # Does not require a non-root user, so the container may run as root.
      securityContext:
        runAsNonRoot: false
      # Startup budget: first GET /health after 30s, then one attempt every
      # 60s; 120 consecutive failures are tolerated (about two hours) before
      # the container is considered failed to start.
      startupProbe:
        httpGet:
          scheme: HTTP
          port: 8000
          path: /health
        initialDelaySeconds: 30
        periodSeconds: 60
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 120
      # Termination message is read from a file inside the container.
      terminationMessagePolicy: File
      terminationMessagePath: /dev/termination-log
      # Mounts: the "shm" volume at /dev/shm and the projected service-account
      # credentials (read-only).
      volumeMounts:
        - name: shm
          mountPath: /dev/shm
        - name: kube-api-access-dlhrd
          mountPath: /var/run/secrets/kubernetes.io/serviceaccount
          readOnly: true
  # --- DNS identity: hostname + subdomain, resolved cluster-first ---
  hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
  subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  # --- Placement and scheduling: already bound to a specific node ---
  nodeName: b200-nodepool-d51376abbf32
  schedulerName: default-scheduler
  priority: 0
  preemptionPolicy: PreemptLowerPriority
  # --- Lifecycle ---
  restartPolicy: Always
  terminationGracePeriodSeconds: 30
  # --- Identity and pod-level security (no pod-level overrides set) ---
  serviceAccountName: default
  serviceAccount: default
  securityContext: {}
  # Stay bound for up to 300s when the node is reported not-ready or
  # unreachable before being evicted.
  tolerations:
    - key: node.kubernetes.io/not-ready
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300
    - key: node.kubernetes.io/unreachable
      operator: Exists
      effect: NoExecute
      tolerationSeconds: 300
  # Pod volumes: a memory-backed scratch volume and the projected
  # service-account credentials.
  volumes:
    # Memory-medium emptyDir capped at 64Gi; the container mounts it at
    # /dev/shm.
    - emptyDir:
        medium: Memory
        sizeLimit: 64Gi
      name: shm
    # Projected volume combining the service-account token, the cluster CA
    # bundle and the Pod's namespace into one directory.
    - name: kube-api-access-dlhrd
      projected:
        # 420 decimal is 0644 in octal.
        defaultMode: 420
        sources:
          - serviceAccountToken:
              expirationSeconds: 3607
              path: token
          - configMap:
              items:
                - key: ca.crt
                  path: ca.crt
              name: kube-root-ca.crt
          - downwardAPI:
              items:
                - fieldRef:
                    apiVersion: v1
                    fieldPath: metadata.namespace
                  path: namespace
# Observed state recorded at capture time; everything below is reported
# status, not desired configuration.
status:
  # The Pod is scheduled, initialized and allowed to start containers, but
  # Ready and ContainersReady are False because the "vllm" container is not
  # ready.
  conditions:
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:32Z'
      observedGeneration: 1
      status: 'True'
      type: PodReadyToStartContainers
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      observedGeneration: 1
      status: 'True'
      type: Initialized
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      message: 'containers with unready status: [vllm]'
      observedGeneration: 1
      reason: ContainersNotReady
      status: 'False'
      type: Ready
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      message: 'containers with unready status: [vllm]'
      observedGeneration: 1
      reason: ContainersNotReady
      status: 'False'
      type: ContainersReady
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      observedGeneration: 1
      status: 'True'
      type: PodScheduled
  # Per-container state for "vllm". The container has restarted once: the
  # previous instance ran from 22:38:31Z to 22:42:20Z and ended with
  # exitCode 1 (reason: Error); the current instance started at 22:42:24Z.
  # NOTE(review): the cause of the exit is not visible in this object — check
  # the previous container's logs.
  containerStatuses:
    - allocatedResources:
        cpu: '8'
        memory: 16Gi
        nvidia.com/gpu: '8'
        rdma/ib: '1'
      containerID: >-
        containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
      image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
      # Digest that the image tag above resolved to for this container.
      imageID: >-
        atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
      lastState:
        terminated:
          containerID: >-
            containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
          exitCode: 1
          finishedAt: '2026-04-15T22:42:20Z'
          reason: Error
          startedAt: '2026-04-15T22:38:31Z'
      name: vllm
      ready: false
      resources:
        limits:
          memory: 1500Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
        requests:
          cpu: '8'
          memory: 16Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
      restartCount: 1
      # started: false alongside a running state — presumably the startup probe
      # had not yet succeeded for the current instance at capture time.
      started: false
      state:
        running:
          startedAt: '2026-04-15T22:42:24Z'
      volumeMounts:
        - mountPath: /dev/shm
          name: shm
        - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
          name: kube-api-access-dlhrd
          readOnly: true
          recursiveReadOnly: Disabled
  # Node and Pod addresses (dual-stack: one IPv4 and one IPv6 each). The Pod
  # addresses match the default-network IPs listed in the CNI annotations.
  hostIP: 10.4.96.13
  hostIPs:
    - ip: 10.4.96.13
    - ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
  observedGeneration: 1
  # phase is Running even though the Ready condition above is False.
  phase: Running
  podIP: 10.244.248.111
  podIPs:
    - ip: 10.244.248.111
    - ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
  # Burstable: the container's requests are set but lower than its limits.
  qosClass: Burstable
  startTime: '2026-04-15T22:38:27Z'
monkey patch the monkey patching vllm nonsense 2026-04-15 22:55:00 +00:00			`apiVersion: v1`
			`kind: Pod`
			`metadata:`
			`annotations:`
			`cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2`
			`cni.projectcalico.org/podIP: 10.244.248.111/32`
			`cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128`
			`k8s.v1.cni.cncf.io/network-status: \|-`
			`[{`
			`"name": "k8s-pod-network",`
			`"ips": [`
			`"10.244.248.111",`
			`"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"`
			`],`
			`"default": true,`
			`"dns": {}`
			`},{`
			`"name": "vllm/ipoib-network-vllm",`
			`"interface": "net1",`
			`"ips": [`
			`"10.66.0.6"`
			`],`
			`"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",`
			`"dns": {}`
			`}]`
			`k8s.v1.cni.cncf.io/networks: ipoib-network-vllm`
			`k8s.v1.cni.cncf.io/networks-status: \|-`
			`[{`
			`"name": "k8s-pod-network",`
			`"ips": [`
			`"10.244.248.111",`
			`"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"`
			`],`
			`"default": true,`
			`"dns": {}`
			`},{`
			`"name": "vllm/ipoib-network-vllm",`
			`"interface": "net1",`
			`"ips": [`
			`"10.66.0.6"`
			`],`
			`"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",`
			`"dns": {}`
			`}]`
			`creationTimestamp: '2026-04-15T22:38:27Z'`
			`generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-`
			`generation: 1`
			`labels:`
			`app.kubernetes.io/component: serving-engine`
			`app.kubernetes.io/instance: production-stack-sea-inference`
			`app.kubernetes.io/managed-by: helm`
			`app.kubernetes.io/name: nemotron-3-super`
			`app.kubernetes.io/part-of: vllm-stack`
			`environment: test`
			`helm-release-name: production-stack-sea-inference`
			`model: nemotron-3-super`
			`pod-template-hash: 856dc7d695`
			`release: test`
			`topology.kubernetes.io/region: sea`
			`name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl`
			`namespace: vllm`
			`ownerReferences:`
			`- apiVersion: apps/v1`
			`blockOwnerDeletion: true`
			`controller: true`
			`kind: ReplicaSet`
			`name: >-`
			`production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695`
			`uid: 88c04723-f29b-432a-8318-21a9d389cac4`
			`resourceVersion: '29767269'`
			`uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e`
			`spec:`
			`containers:`
			`- command:`
			`- vllm`
			`- serve`
			`- nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4`
			`- '--host'`
			`- 0.0.0.0`
			`- '--port'`
			`- '8000'`
			`- '--no-enable-prefix-caching'`
			`- '--tensor-parallel-size'`
			`- '8'`
			`- '--async-scheduling'`
			`- '--dtype=auto'`
			`- '--attention-backend=TRITON_ATTN'`
			`- '--gpu_memory_utilization=0.96'`
			`- '--enable-auto-tool-choice'`
			`- '--tool-call-parser=qwen3_coder'`
			`- '--trust_remote_code'`
			`- '--max-cudagraph-capture-size=128'`
			`- '--enable-chunked-prefill'`
			`- '--mamba-ssm-cache-dtype=float16'`
			`- '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'`
			`- '--reasoning-parser=super_v3'`
			`- '--max-model-len=1048576'`
			`- '--disable-custom-all-reduce'`
			`- '--no-disable-hybrid-kv-cache-manager'`
			`- '--enforce-eager'`
			`- '--kv-transfer-config'`
			`- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'`
			`env:`
			`- name: PYTHONHASHSEED`
			`value: '123'`
			`- name: HF_HOME`
			`value: /tmp`
			`- name: POD_IP`
			`valueFrom:`
			`fieldRef:`
			`apiVersion: v1`
			`fieldPath: status.podIP`
			`- name: PROMETHEUS_MULTIPROC_DIR`
			`value: /tmp`
			`- name: OMP_NUM_THREADS`
			`value: '32'`
			`- name: HF_TOKEN`
			`value: <redacted-hf-token>`
			`- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN`
			`value: '1'`
			`- name: NCCL_TOPO_FILE`
			`value: /etc/nccl/virtualTopology.xml`
			`- name: PYTORCH_CUDA_ALLOC_CONF`
			`value: expandable_segments:True`
			`- name: LMCACHE_REMOTE_URL`
			`value: redis://10.66.0.100:6379`
			`- name: LMCACHE_REMOTE_SERDE`
			`value: naive`
			`- name: LMCACHE_USE_EXPERIMENTAL`
			`value: 'True'`
			`- name: VLLM_RPC_TIMEOUT`
			`value: '1000000'`
			`- name: LMCACHE_LOG_LEVEL`
			`value: ERROR`
			`- name: LMCACHE_LOCAL_CPU`
			`value: 'True'`
			`- name: LMCACHE_MAX_LOCAL_CPU_SIZE`
			`value: '512'`
			`- name: LMCACHE_LMCACHE_INSTANCE_ID`
			`valueFrom:`
			`fieldRef:`
			`apiVersion: v1`
			`fieldPath: metadata.name`
			`image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130`
			`imagePullPolicy: Always`
			`livenessProbe:`
			`failureThreshold: 3`
			`httpGet:`
			`path: /health`
			`port: 8000`
			`scheme: HTTP`
			`initialDelaySeconds: 15`
			`periodSeconds: 10`
			`successThreshold: 1`
			`timeoutSeconds: 1`
			`name: vllm`
			`ports:`
			`- containerPort: 8000`
			`name: container-port`
			`protocol: TCP`
			`- containerPort: 55555`
			`name: zmq-port`
			`protocol: TCP`
			`- containerPort: 9999`
			`name: ucx-port`
			`protocol: TCP`
			`readinessProbe:`
			`failureThreshold: 3`
			`httpGet:`
			`path: /health`
			`port: 8000`
			`scheme: HTTP`
			`initialDelaySeconds: 15`
			`periodSeconds: 5`
			`successThreshold: 1`
			`timeoutSeconds: 1`
			`resources:`
			`limits:`
			`memory: 1500Gi`
			`nvidia.com/gpu: '8'`
			`rdma/ib: '1'`
			`requests:`
			`cpu: '8'`
			`memory: 16Gi`
			`nvidia.com/gpu: '8'`
			`rdma/ib: '1'`
			`securityContext:`
			`runAsNonRoot: false`
			`startupProbe:`
			`failureThreshold: 120`
			`httpGet:`
			`path: /health`
			`port: 8000`
			`scheme: HTTP`
			`initialDelaySeconds: 30`
			`periodSeconds: 60`
			`successThreshold: 1`
			`timeoutSeconds: 1`
			`terminationMessagePath: /dev/termination-log`
			`terminationMessagePolicy: File`
			`volumeMounts:`
			`- mountPath: /dev/shm`
			`name: shm`
			`- mountPath: /var/run/secrets/kubernetes.io/serviceaccount`
			`name: kube-api-access-dlhrd`
			`readOnly: true`
			`dnsPolicy: ClusterFirst`
			`enableServiceLinks: true`
			`hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack`
			`nodeName: b200-nodepool-d51376abbf32`
			`preemptionPolicy: PreemptLowerPriority`
			`priority: 0`
			`restartPolicy: Always`
			`schedulerName: default-scheduler`
			`securityContext: {}`
			`serviceAccount: default`
			`serviceAccountName: default`
			`subdomain: production-stack-sea-inference-nemotron-3-super-engine-service`
			`terminationGracePeriodSeconds: 30`
			`tolerations:`
			`- effect: NoExecute`
			`key: node.kubernetes.io/not-ready`
			`operator: Exists`
			`tolerationSeconds: 300`
			`- effect: NoExecute`
			`key: node.kubernetes.io/unreachable`
			`operator: Exists`
			`tolerationSeconds: 300`
			`volumes:`
			`- emptyDir:`
			`medium: Memory`
			`sizeLimit: 64Gi`
			`name: shm`
			`- name: kube-api-access-dlhrd`
			`projected:`
			`defaultMode: 420`
			`sources:`
			`- serviceAccountToken:`
			`expirationSeconds: 3607`
			`path: token`
			`- configMap:`
			`items:`
			`- key: ca.crt`
			`path: ca.crt`
			`name: kube-root-ca.crt`
			`- downwardAPI:`
			`items:`
			`- fieldRef:`
			`apiVersion: v1`
			`fieldPath: metadata.namespace`
			`path: namespace`
			`status:`
			`conditions:`
			`- lastProbeTime: null`
			`lastTransitionTime: '2026-04-15T22:38:32Z'`
			`observedGeneration: 1`
			`status: 'True'`
			`type: PodReadyToStartContainers`
			`- lastProbeTime: null`
			`lastTransitionTime: '2026-04-15T22:38:27Z'`
			`observedGeneration: 1`
			`status: 'True'`
			`type: Initialized`
			`- lastProbeTime: null`
			`lastTransitionTime: '2026-04-15T22:38:27Z'`
			`message: 'containers with unready status: [vllm]'`
			`observedGeneration: 1`
			`reason: ContainersNotReady`
			`status: 'False'`
			`type: Ready`
			`- lastProbeTime: null`
			`lastTransitionTime: '2026-04-15T22:38:27Z'`
			`message: 'containers with unready status: [vllm]'`
			`observedGeneration: 1`
			`reason: ContainersNotReady`
			`status: 'False'`
			`type: ContainersReady`
			`- lastProbeTime: null`
			`lastTransitionTime: '2026-04-15T22:38:27Z'`
			`observedGeneration: 1`
			`status: 'True'`
			`type: PodScheduled`
			`containerStatuses:`
			`- allocatedResources:`
			`cpu: '8'`
			`memory: 16Gi`
			`nvidia.com/gpu: '8'`
			`rdma/ib: '1'`
			`containerID: >-`
			`containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b`
			`image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130`
			`imageID: >-`
			`atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59`
			`lastState:`
			`terminated:`
			`containerID: >-`
			`containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a`
			`exitCode: 1`
			`finishedAt: '2026-04-15T22:42:20Z'`
			`reason: Error`
			`startedAt: '2026-04-15T22:38:31Z'`
			`name: vllm`
			`ready: false`
			`resources:`
			`limits:`
			`memory: 1500Gi`
			`nvidia.com/gpu: '8'`
			`rdma/ib: '1'`
			`requests:`
			`cpu: '8'`
			`memory: 16Gi`
			`nvidia.com/gpu: '8'`
			`rdma/ib: '1'`
			`restartCount: 1`
			`started: false`
			`state:`
			`running:`
			`startedAt: '2026-04-15T22:42:24Z'`
			`volumeMounts:`
			`- mountPath: /dev/shm`
			`name: shm`
			`- mountPath: /var/run/secrets/kubernetes.io/serviceaccount`
			`name: kube-api-access-dlhrd`
			`readOnly: true`
			`recursiveReadOnly: Disabled`
			`hostIP: 10.4.96.13`
			`hostIPs:`
			`- ip: 10.4.96.13`
			`- ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32`
			`observedGeneration: 1`
			`phase: Running`
			`podIP: 10.244.248.111`
			`podIPs:`
			`- ip: 10.244.248.111`
			`- ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0`
			`qosClass: Burstable`
			`startTime: '2026-04-15T22:38:27Z'`