monkey patch the monkey pathing vllm nonsense
This commit is contained in:
337
THIS_IS_MY_POD_SUMMARY_VINNY.md
Normal file
337
THIS_IS_MY_POD_SUMMARY_VINNY.md
Normal file
@@ -0,0 +1,337 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
annotations:
|
||||
cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
|
||||
cni.projectcalico.org/podIP: 10.244.248.111/32
|
||||
cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
|
||||
k8s.v1.cni.cncf.io/network-status: |-
|
||||
[{
|
||||
"name": "k8s-pod-network",
|
||||
"ips": [
|
||||
"10.244.248.111",
|
||||
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||||
],
|
||||
"default": true,
|
||||
"dns": {}
|
||||
},{
|
||||
"name": "vllm/ipoib-network-vllm",
|
||||
"interface": "net1",
|
||||
"ips": [
|
||||
"10.66.0.6"
|
||||
],
|
||||
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||||
"dns": {}
|
||||
}]
|
||||
k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
|
||||
k8s.v1.cni.cncf.io/networks-status: |-
|
||||
[{
|
||||
"name": "k8s-pod-network",
|
||||
"ips": [
|
||||
"10.244.248.111",
|
||||
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||||
],
|
||||
"default": true,
|
||||
"dns": {}
|
||||
},{
|
||||
"name": "vllm/ipoib-network-vllm",
|
||||
"interface": "net1",
|
||||
"ips": [
|
||||
"10.66.0.6"
|
||||
],
|
||||
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||||
"dns": {}
|
||||
}]
|
||||
creationTimestamp: '2026-04-15T22:38:27Z'
|
||||
generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
|
||||
generation: 1
|
||||
labels:
|
||||
app.kubernetes.io/component: serving-engine
|
||||
app.kubernetes.io/instance: production-stack-sea-inference
|
||||
app.kubernetes.io/managed-by: helm
|
||||
app.kubernetes.io/name: nemotron-3-super
|
||||
app.kubernetes.io/part-of: vllm-stack
|
||||
environment: test
|
||||
helm-release-name: production-stack-sea-inference
|
||||
model: nemotron-3-super
|
||||
pod-template-hash: 856dc7d695
|
||||
release: test
|
||||
topology.kubernetes.io/region: sea
|
||||
name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
|
||||
namespace: vllm
|
||||
ownerReferences:
|
||||
- apiVersion: apps/v1
|
||||
blockOwnerDeletion: true
|
||||
controller: true
|
||||
kind: ReplicaSet
|
||||
name: >-
|
||||
production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
|
||||
uid: 88c04723-f29b-432a-8318-21a9d389cac4
|
||||
resourceVersion: '29767269'
|
||||
uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
|
||||
spec:
|
||||
containers:
|
||||
- command:
|
||||
- vllm
|
||||
- serve
|
||||
- nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
|
||||
- '--host'
|
||||
- 0.0.0.0
|
||||
- '--port'
|
||||
- '8000'
|
||||
- '--no-enable-prefix-caching'
|
||||
- '--tensor-parallel-size'
|
||||
- '8'
|
||||
- '--async-scheduling'
|
||||
- '--dtype=auto'
|
||||
- '--attention-backend=TRITON_ATTN'
|
||||
- '--gpu_memory_utilization=0.96'
|
||||
- '--enable-auto-tool-choice'
|
||||
- '--tool-call-parser=qwen3_coder'
|
||||
- '--trust_remote_code'
|
||||
- '--max-cudagraph-capture-size=128'
|
||||
- '--enable-chunked-prefill'
|
||||
- '--mamba-ssm-cache-dtype=float16'
|
||||
- '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
|
||||
- '--reasoning-parser=super_v3'
|
||||
- '--max-model-len=1048576'
|
||||
- '--disable-custom-all-reduce'
|
||||
- '--no-disable-hybrid-kv-cache-manager'
|
||||
- '--enforce-eager'
|
||||
- '--kv-transfer-config'
|
||||
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
|
||||
env:
|
||||
- name: PYTHONHASHSEED
|
||||
value: '123'
|
||||
- name: HF_HOME
|
||||
value: /tmp
|
||||
- name: POD_IP
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: status.podIP
|
||||
- name: PROMETHEUS_MULTIPROC_DIR
|
||||
value: /tmp
|
||||
- name: OMP_NUM_THREADS
|
||||
value: '32'
|
||||
- name: HF_TOKEN
|
||||
value: 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
|
||||
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
|
||||
value: '1'
|
||||
- name: NCCL_TOPO_FILE
|
||||
value: /etc/nccl/virtualTopology.xml
|
||||
- name: PYTORCH_CUDA_ALLOC_CONF
|
||||
value: expandable_segments:True
|
||||
- name: LMCACHE_REMOTE_URL
|
||||
value: redis://10.66.0.100:6379
|
||||
- name: LMCACHE_REMOTE_SERDE
|
||||
value: naive
|
||||
- name: LMCACHE_USE_EXPERIMENTAL
|
||||
value: 'True'
|
||||
- name: VLLM_RPC_TIMEOUT
|
||||
value: '1000000'
|
||||
- name: LMCACHE_LOG_LEVEL
|
||||
value: ERROR
|
||||
- name: LMCACHE_LOCAL_CPU
|
||||
value: 'True'
|
||||
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
|
||||
value: '512'
|
||||
- name: LMCACHE_LMCACHE_INSTANCE_ID
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: metadata.name
|
||||
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||||
imagePullPolicy: Always
|
||||
livenessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
name: vllm
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: container-port
|
||||
protocol: TCP
|
||||
- containerPort: 55555
|
||||
name: zmq-port
|
||||
protocol: TCP
|
||||
- containerPort: 9999
|
||||
name: ucx-port
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 5
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
requests:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
securityContext:
|
||||
runAsNonRoot: false
|
||||
startupProbe:
|
||||
failureThreshold: 120
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8000
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 60
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: shm
|
||||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||||
name: kube-api-access-dlhrd
|
||||
readOnly: true
|
||||
dnsPolicy: ClusterFirst
|
||||
enableServiceLinks: true
|
||||
hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
|
||||
nodeName: b200-nodepool-d51376abbf32
|
||||
preemptionPolicy: PreemptLowerPriority
|
||||
priority: 0
|
||||
restartPolicy: Always
|
||||
schedulerName: default-scheduler
|
||||
securityContext: {}
|
||||
serviceAccount: default
|
||||
serviceAccountName: default
|
||||
subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
|
||||
terminationGracePeriodSeconds: 30
|
||||
tolerations:
|
||||
- effect: NoExecute
|
||||
key: node.kubernetes.io/not-ready
|
||||
operator: Exists
|
||||
tolerationSeconds: 300
|
||||
- effect: NoExecute
|
||||
key: node.kubernetes.io/unreachable
|
||||
operator: Exists
|
||||
tolerationSeconds: 300
|
||||
volumes:
|
||||
- emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 64Gi
|
||||
name: shm
|
||||
- name: kube-api-access-dlhrd
|
||||
projected:
|
||||
defaultMode: 420
|
||||
sources:
|
||||
- serviceAccountToken:
|
||||
expirationSeconds: 3607
|
||||
path: token
|
||||
- configMap:
|
||||
items:
|
||||
- key: ca.crt
|
||||
path: ca.crt
|
||||
name: kube-root-ca.crt
|
||||
- downwardAPI:
|
||||
items:
|
||||
- fieldRef:
|
||||
apiVersion: v1
|
||||
fieldPath: metadata.namespace
|
||||
path: namespace
|
||||
status:
|
||||
conditions:
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:32Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: PodReadyToStartContainers
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: Initialized
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
message: 'containers with unready status: [vllm]'
|
||||
observedGeneration: 1
|
||||
reason: ContainersNotReady
|
||||
status: 'False'
|
||||
type: Ready
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
message: 'containers with unready status: [vllm]'
|
||||
observedGeneration: 1
|
||||
reason: ContainersNotReady
|
||||
status: 'False'
|
||||
type: ContainersReady
|
||||
- lastProbeTime: null
|
||||
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||||
observedGeneration: 1
|
||||
status: 'True'
|
||||
type: PodScheduled
|
||||
containerStatuses:
|
||||
- allocatedResources:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
containerID: >-
|
||||
containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
|
||||
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||||
imageID: >-
|
||||
atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
|
||||
lastState:
|
||||
terminated:
|
||||
containerID: >-
|
||||
containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
|
||||
exitCode: 1
|
||||
finishedAt: '2026-04-15T22:42:20Z'
|
||||
reason: Error
|
||||
startedAt: '2026-04-15T22:38:31Z'
|
||||
name: vllm
|
||||
ready: false
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
requests:
|
||||
cpu: '8'
|
||||
memory: 16Gi
|
||||
nvidia.com/gpu: '8'
|
||||
rdma/ib: '1'
|
||||
restartCount: 1
|
||||
started: false
|
||||
state:
|
||||
running:
|
||||
startedAt: '2026-04-15T22:42:24Z'
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: shm
|
||||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||||
name: kube-api-access-dlhrd
|
||||
readOnly: true
|
||||
recursiveReadOnly: Disabled
|
||||
hostIP: 10.4.96.13
|
||||
hostIPs:
|
||||
- ip: 10.4.96.13
|
||||
- ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
|
||||
observedGeneration: 1
|
||||
phase: Running
|
||||
podIP: 10.244.248.111
|
||||
podIPs:
|
||||
- ip: 10.244.248.111
|
||||
- ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
|
||||
qosClass: Burstable
|
||||
startTime: '2026-04-15T22:38:27Z'
|
||||
Reference in New Issue
Block a user