# Source: vllm-with-lmcache/THIS_IS_MY_POD_SUMMARY_VINNY.md (338 lines, 10 KiB)
# Pasted pod dump; non-YAML file-browser chrome removed and kept as this comment.
---
# Kubernetes Pod object (apiserver dump) for the vLLM + LMCache serving engine.
apiVersion: v1
kind: Pod
metadata:
  annotations:
    cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
    cni.projectcalico.org/podIP: 10.244.248.111/32
    cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
    # Multus-reported attachment results: default Calico network plus the
    # secondary InfiniBand (IPoIB) interface "net1" used for KV transfer.
    k8s.v1.cni.cncf.io/network-status: |-
      [{
          "name": "k8s-pod-network",
          "ips": [
              "10.244.248.111",
              "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
          ],
          "default": true,
          "dns": {}
      },{
          "name": "vllm/ipoib-network-vllm",
          "interface": "net1",
          "ips": [
              "10.66.0.6"
          ],
          "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
          "dns": {}
      }]
    k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
    # NOTE(review): "networks-status" is the deprecated alias of
    # "network-status"; Multus writes both, so the duplicate payload is kept.
    k8s.v1.cni.cncf.io/networks-status: |-
      [{
          "name": "k8s-pod-network",
          "ips": [
              "10.244.248.111",
              "fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
          ],
          "default": true,
          "dns": {}
      },{
          "name": "vllm/ipoib-network-vllm",
          "interface": "net1",
          "ips": [
              "10.66.0.6"
          ],
          "mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
          "dns": {}
      }]
  creationTimestamp: '2026-04-15T22:38:27Z'
  generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
  generation: 1
  labels:
    app.kubernetes.io/component: serving-engine
    app.kubernetes.io/instance: production-stack-sea-inference
    app.kubernetes.io/managed-by: helm
    app.kubernetes.io/name: nemotron-3-super
    app.kubernetes.io/part-of: vllm-stack
    environment: test
    helm-release-name: production-stack-sea-inference
    model: nemotron-3-super
    pod-template-hash: 856dc7d695
    release: test
    topology.kubernetes.io/region: sea
  name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
  namespace: vllm
  ownerReferences:
    - apiVersion: apps/v1
      blockOwnerDeletion: true
      controller: true
      kind: ReplicaSet
      name: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
      uid: 88c04723-f29b-432a-8318-21a9d389cac4
  resourceVersion: '29767269'
  uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
spec:
  containers:
    # Single serving container: `vllm serve` with tensor parallelism across
    # all 8 GPUs and the LMCache KV connector for remote/CPU KV caching.
    - command:
        - vllm
        - serve
        - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
        - '--host'
        - 0.0.0.0
        - '--port'
        - '8000'
        - '--no-enable-prefix-caching'
        - '--tensor-parallel-size'
        - '8'
        - '--async-scheduling'
        - '--dtype=auto'
        - '--attention-backend=TRITON_ATTN'
        - '--gpu_memory_utilization=0.96'
        - '--enable-auto-tool-choice'
        - '--tool-call-parser=qwen3_coder'
        - '--trust_remote_code'
        - '--max-cudagraph-capture-size=128'
        - '--enable-chunked-prefill'
        - '--mamba-ssm-cache-dtype=float16'
        - '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
        - '--reasoning-parser=super_v3'
        - '--max-model-len=1048576'
        - '--disable-custom-all-reduce'
        - '--no-disable-hybrid-kv-cache-manager'
        - '--enforce-eager'
        - '--kv-transfer-config'
        - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
      env:
        - name: PYTHONHASHSEED
          value: '123'
        - name: HF_HOME
          value: /tmp
        - name: POD_IP
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: status.podIP
        - name: PROMETHEUS_MULTIPROC_DIR
          value: /tmp
        - name: OMP_NUM_THREADS
          value: '32'
        # SECURITY(review): plaintext Hugging Face token embedded in the pod
        # spec — readable by anyone with `get pod` access to this namespace.
        # Move it into a Secret (valueFrom.secretKeyRef) and revoke this token.
        - name: HF_TOKEN
          value: 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
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
          value: '1'
        - name: NCCL_TOPO_FILE
          value: /etc/nccl/virtualTopology.xml
        - name: PYTORCH_CUDA_ALLOC_CONF
          value: expandable_segments:True
        - name: LMCACHE_REMOTE_URL
          value: redis://10.66.0.100:6379
        - name: LMCACHE_REMOTE_SERDE
          value: naive
        - name: LMCACHE_USE_EXPERIMENTAL
          value: 'True'
        - name: VLLM_RPC_TIMEOUT
          value: '1000000'
        - name: LMCACHE_LOG_LEVEL
          value: ERROR
        - name: LMCACHE_LOCAL_CPU
          value: 'True'
        - name: LMCACHE_MAX_LOCAL_CPU_SIZE
          value: '512'
        # Instance id is derived from the pod name via the downward API.
        - name: LMCACHE_LMCACHE_INSTANCE_ID
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
      image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
      imagePullPolicy: Always
      livenessProbe:
        failureThreshold: 3
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 15
        periodSeconds: 10
        successThreshold: 1
        timeoutSeconds: 1
      name: vllm
      ports:
        - containerPort: 8000
          name: container-port
          protocol: TCP
        - containerPort: 55555
          name: zmq-port
          protocol: TCP
        - containerPort: 9999
          name: ucx-port
          protocol: TCP
      readinessProbe:
        failureThreshold: 3
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 15
        periodSeconds: 5
        successThreshold: 1
        timeoutSeconds: 1
      resources:
        limits:
          memory: 1500Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
        requests:
          cpu: '8'
          memory: 16Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
      securityContext:
        runAsNonRoot: false
      # Startup probe allows up to 120 * 60s = 2h for model load before the
      # liveness probe takes over.
      startupProbe:
        failureThreshold: 120
        httpGet:
          path: /health
          port: 8000
          scheme: HTTP
        initialDelaySeconds: 30
        periodSeconds: 60
        successThreshold: 1
        timeoutSeconds: 1
      terminationMessagePath: /dev/termination-log
      terminationMessagePolicy: File
      volumeMounts:
        - mountPath: /dev/shm
          name: shm
        - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
          name: kube-api-access-dlhrd
          readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
  nodeName: b200-nodepool-d51376abbf32
  preemptionPolicy: PreemptLowerPriority
  priority: 0
  restartPolicy: Always
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
  terminationGracePeriodSeconds: 30
  tolerations:
    - effect: NoExecute
      key: node.kubernetes.io/not-ready
      operator: Exists
      tolerationSeconds: 300
    - effect: NoExecute
      key: node.kubernetes.io/unreachable
      operator: Exists
      tolerationSeconds: 300
  volumes:
    # tmpfs-backed /dev/shm for inter-process tensor sharing (64Gi cap).
    - emptyDir:
        medium: Memory
        sizeLimit: 64Gi
      name: shm
    - name: kube-api-access-dlhrd
      projected:
        defaultMode: 420
        sources:
          - serviceAccountToken:
              expirationSeconds: 3607
              path: token
          - configMap:
              items:
                - key: ca.crt
                  path: ca.crt
              name: kube-root-ca.crt
          - downwardAPI:
              items:
                - fieldRef:
                    apiVersion: v1
                    fieldPath: metadata.namespace
                  path: namespace
status:
  conditions:
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:32Z'
      observedGeneration: 1
      status: 'True'
      type: PodReadyToStartContainers
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      observedGeneration: 1
      status: 'True'
      type: Initialized
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      message: 'containers with unready status: [vllm]'
      observedGeneration: 1
      reason: ContainersNotReady
      status: 'False'
      type: Ready
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      message: 'containers with unready status: [vllm]'
      observedGeneration: 1
      reason: ContainersNotReady
      status: 'False'
      type: ContainersReady
    - lastProbeTime: null
      lastTransitionTime: '2026-04-15T22:38:27Z'
      observedGeneration: 1
      status: 'True'
      type: PodScheduled
  containerStatuses:
    - allocatedResources:
        cpu: '8'
        memory: 16Gi
        nvidia.com/gpu: '8'
        rdma/ib: '1'
      containerID: >-
        containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
      image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
      imageID: >-
        atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
      # NOTE(review): the previous run exited with code 1 ~4 minutes after
      # start (restartCount: 1) — inspect `kubectl logs --previous` for the
      # startup failure before trusting the current Running state.
      lastState:
        terminated:
          containerID: >-
            containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
          exitCode: 1
          finishedAt: '2026-04-15T22:42:20Z'
          reason: Error
          startedAt: '2026-04-15T22:38:31Z'
      name: vllm
      ready: false
      resources:
        limits:
          memory: 1500Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
        requests:
          cpu: '8'
          memory: 16Gi
          nvidia.com/gpu: '8'
          rdma/ib: '1'
      restartCount: 1
      started: false
      state:
        running:
          startedAt: '2026-04-15T22:42:24Z'
      volumeMounts:
        - mountPath: /dev/shm
          name: shm
        - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
          name: kube-api-access-dlhrd
          readOnly: true
          recursiveReadOnly: Disabled
  hostIP: 10.4.96.13
  hostIPs:
    - ip: 10.4.96.13
    - ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
  observedGeneration: 1
  phase: Running
  podIP: 10.244.248.111
  podIPs:
    - ip: 10.244.248.111
    - ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
  qosClass: Burstable
  startTime: '2026-04-15T22:38:27Z'