338 lines
10 KiB
Markdown
338 lines
10 KiB
Markdown
|
|
apiVersion: v1
|
||
|
|
kind: Pod
|
||
|
|
metadata:
|
||
|
|
annotations:
|
||
|
|
cni.projectcalico.org/containerID: 9caa7ab141b172fc71e1ab4e1bc9fc8d13ab4959babc6c2146332ab4bb024dd2
|
||
|
|
cni.projectcalico.org/podIP: 10.244.248.111/32
|
||
|
|
cni.projectcalico.org/podIPs: 10.244.248.111/32,fd10:1ba:6d2c:1000:129b:6fa:8473:78d0/128
|
||
|
|
k8s.v1.cni.cncf.io/network-status: |-
|
||
|
|
[{
|
||
|
|
"name": "k8s-pod-network",
|
||
|
|
"ips": [
|
||
|
|
"10.244.248.111",
|
||
|
|
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||
|
|
],
|
||
|
|
"default": true,
|
||
|
|
"dns": {}
|
||
|
|
},{
|
||
|
|
"name": "vllm/ipoib-network-vllm",
|
||
|
|
"interface": "net1",
|
||
|
|
"ips": [
|
||
|
|
"10.66.0.6"
|
||
|
|
],
|
||
|
|
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||
|
|
"dns": {}
|
||
|
|
}]
|
||
|
|
k8s.v1.cni.cncf.io/networks: ipoib-network-vllm
|
||
|
|
k8s.v1.cni.cncf.io/networks-status: |-
|
||
|
|
[{
|
||
|
|
"name": "k8s-pod-network",
|
||
|
|
"ips": [
|
||
|
|
"10.244.248.111",
|
||
|
|
"fd10:1ba:6d2c:1000:129b:6fa:8473:78d0"
|
||
|
|
],
|
||
|
|
"default": true,
|
||
|
|
"dns": {}
|
||
|
|
},{
|
||
|
|
"name": "vllm/ipoib-network-vllm",
|
||
|
|
"interface": "net1",
|
||
|
|
"ips": [
|
||
|
|
"10.66.0.6"
|
||
|
|
],
|
||
|
|
"mac": "00:00:46:55:fe:80:00:00:00:00:00:00:8c:91:3a:03:00:b6:53:44",
|
||
|
|
"dns": {}
|
||
|
|
}]
|
||
|
|
creationTimestamp: '2026-04-15T22:38:27Z'
|
||
|
|
generateName: production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695-
|
||
|
|
generation: 1
|
||
|
|
labels:
|
||
|
|
app.kubernetes.io/component: serving-engine
|
||
|
|
app.kubernetes.io/instance: production-stack-sea-inference
|
||
|
|
app.kubernetes.io/managed-by: helm
|
||
|
|
app.kubernetes.io/name: nemotron-3-super
|
||
|
|
app.kubernetes.io/part-of: vllm-stack
|
||
|
|
environment: test
|
||
|
|
helm-release-name: production-stack-sea-inference
|
||
|
|
model: nemotron-3-super
|
||
|
|
pod-template-hash: 856dc7d695
|
||
|
|
release: test
|
||
|
|
topology.kubernetes.io/region: sea
|
||
|
|
name: production-stack-sea-inference-nemotron-3-super-deploymenttdwgl
|
||
|
|
namespace: vllm
|
||
|
|
ownerReferences:
|
||
|
|
- apiVersion: apps/v1
|
||
|
|
blockOwnerDeletion: true
|
||
|
|
controller: true
|
||
|
|
kind: ReplicaSet
|
||
|
|
name: >-
|
||
|
|
production-stack-sea-inference-nemotron-3-super-deployment-vllm-856dc7d695
|
||
|
|
uid: 88c04723-f29b-432a-8318-21a9d389cac4
|
||
|
|
resourceVersion: '29767269'
|
||
|
|
uid: 4f9f1445-1095-4d28-89c3-9267ae6cd70e
|
||
|
|
spec:
|
||
|
|
containers:
|
||
|
|
- command:
|
||
|
|
- vllm
|
||
|
|
- serve
|
||
|
|
- nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
|
||
|
|
- '--host'
|
||
|
|
- 0.0.0.0
|
||
|
|
- '--port'
|
||
|
|
- '8000'
|
||
|
|
- '--no-enable-prefix-caching'
|
||
|
|
- '--tensor-parallel-size'
|
||
|
|
- '8'
|
||
|
|
- '--async-scheduling'
|
||
|
|
- '--dtype=auto'
|
||
|
|
- '--attention-backend=TRITON_ATTN'
|
||
|
|
- '--gpu_memory_utilization=0.96'
|
||
|
|
- '--enable-auto-tool-choice'
|
||
|
|
- '--tool-call-parser=qwen3_coder'
|
||
|
|
- '--trust_remote_code'
|
||
|
|
- '--max-cudagraph-capture-size=128'
|
||
|
|
- '--enable-chunked-prefill'
|
||
|
|
- '--mamba-ssm-cache-dtype=float16'
|
||
|
|
- '--reasoning-parser-plugin=/opt/super_v3_reasoning_parser.py'
|
||
|
|
- '--reasoning-parser=super_v3'
|
||
|
|
- '--max-model-len=1048576'
|
||
|
|
- '--disable-custom-all-reduce'
|
||
|
|
- '--no-disable-hybrid-kv-cache-manager'
|
||
|
|
- '--enforce-eager'
|
||
|
|
- '--kv-transfer-config'
|
||
|
|
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
|
||
|
|
env:
|
||
|
|
- name: PYTHONHASHSEED
|
||
|
|
value: '123'
|
||
|
|
- name: HF_HOME
|
||
|
|
value: /tmp
|
||
|
|
- name: POD_IP
|
||
|
|
valueFrom:
|
||
|
|
fieldRef:
|
||
|
|
apiVersion: v1
|
||
|
|
fieldPath: status.podIP
|
||
|
|
- name: PROMETHEUS_MULTIPROC_DIR
|
||
|
|
value: /tmp
|
||
|
|
- name: OMP_NUM_THREADS
|
||
|
|
value: '32'
|
||
|
|
- name: HF_TOKEN
|
||
|
|
value: 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
|
||
|
|
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
|
||
|
|
value: '1'
|
||
|
|
- name: NCCL_TOPO_FILE
|
||
|
|
value: /etc/nccl/virtualTopology.xml
|
||
|
|
- name: PYTORCH_CUDA_ALLOC_CONF
|
||
|
|
value: expandable_segments:True
|
||
|
|
- name: LMCACHE_REMOTE_URL
|
||
|
|
value: redis://10.66.0.100:6379
|
||
|
|
- name: LMCACHE_REMOTE_SERDE
|
||
|
|
value: naive
|
||
|
|
- name: LMCACHE_USE_EXPERIMENTAL
|
||
|
|
value: 'True'
|
||
|
|
- name: VLLM_RPC_TIMEOUT
|
||
|
|
value: '1000000'
|
||
|
|
- name: LMCACHE_LOG_LEVEL
|
||
|
|
value: ERROR
|
||
|
|
- name: LMCACHE_LOCAL_CPU
|
||
|
|
value: 'True'
|
||
|
|
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
|
||
|
|
value: '512'
|
||
|
|
- name: LMCACHE_LMCACHE_INSTANCE_ID
|
||
|
|
valueFrom:
|
||
|
|
fieldRef:
|
||
|
|
apiVersion: v1
|
||
|
|
fieldPath: metadata.name
|
||
|
|
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||
|
|
imagePullPolicy: Always
|
||
|
|
livenessProbe:
|
||
|
|
failureThreshold: 3
|
||
|
|
httpGet:
|
||
|
|
path: /health
|
||
|
|
port: 8000
|
||
|
|
scheme: HTTP
|
||
|
|
initialDelaySeconds: 15
|
||
|
|
periodSeconds: 10
|
||
|
|
successThreshold: 1
|
||
|
|
timeoutSeconds: 1
|
||
|
|
name: vllm
|
||
|
|
ports:
|
||
|
|
- containerPort: 8000
|
||
|
|
name: container-port
|
||
|
|
protocol: TCP
|
||
|
|
- containerPort: 55555
|
||
|
|
name: zmq-port
|
||
|
|
protocol: TCP
|
||
|
|
- containerPort: 9999
|
||
|
|
name: ucx-port
|
||
|
|
protocol: TCP
|
||
|
|
readinessProbe:
|
||
|
|
failureThreshold: 3
|
||
|
|
httpGet:
|
||
|
|
path: /health
|
||
|
|
port: 8000
|
||
|
|
scheme: HTTP
|
||
|
|
initialDelaySeconds: 15
|
||
|
|
periodSeconds: 5
|
||
|
|
successThreshold: 1
|
||
|
|
timeoutSeconds: 1
|
||
|
|
resources:
|
||
|
|
limits:
|
||
|
|
memory: 1500Gi
|
||
|
|
nvidia.com/gpu: '8'
|
||
|
|
rdma/ib: '1'
|
||
|
|
requests:
|
||
|
|
cpu: '8'
|
||
|
|
memory: 16Gi
|
||
|
|
nvidia.com/gpu: '8'
|
||
|
|
rdma/ib: '1'
|
||
|
|
securityContext:
|
||
|
|
runAsNonRoot: false
|
||
|
|
startupProbe:
|
||
|
|
failureThreshold: 120
|
||
|
|
httpGet:
|
||
|
|
path: /health
|
||
|
|
port: 8000
|
||
|
|
scheme: HTTP
|
||
|
|
initialDelaySeconds: 30
|
||
|
|
periodSeconds: 60
|
||
|
|
successThreshold: 1
|
||
|
|
timeoutSeconds: 1
|
||
|
|
terminationMessagePath: /dev/termination-log
|
||
|
|
terminationMessagePolicy: File
|
||
|
|
volumeMounts:
|
||
|
|
- mountPath: /dev/shm
|
||
|
|
name: shm
|
||
|
|
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||
|
|
name: kube-api-access-dlhrd
|
||
|
|
readOnly: true
|
||
|
|
dnsPolicy: ClusterFirst
|
||
|
|
enableServiceLinks: true
|
||
|
|
hostname: production-stack-sea-inference-nemotron-3-super-vllm-stack
|
||
|
|
nodeName: b200-nodepool-d51376abbf32
|
||
|
|
preemptionPolicy: PreemptLowerPriority
|
||
|
|
priority: 0
|
||
|
|
restartPolicy: Always
|
||
|
|
schedulerName: default-scheduler
|
||
|
|
securityContext: {}
|
||
|
|
serviceAccount: default
|
||
|
|
serviceAccountName: default
|
||
|
|
subdomain: production-stack-sea-inference-nemotron-3-super-engine-service
|
||
|
|
terminationGracePeriodSeconds: 30
|
||
|
|
tolerations:
|
||
|
|
- effect: NoExecute
|
||
|
|
key: node.kubernetes.io/not-ready
|
||
|
|
operator: Exists
|
||
|
|
tolerationSeconds: 300
|
||
|
|
- effect: NoExecute
|
||
|
|
key: node.kubernetes.io/unreachable
|
||
|
|
operator: Exists
|
||
|
|
tolerationSeconds: 300
|
||
|
|
volumes:
|
||
|
|
- emptyDir:
|
||
|
|
medium: Memory
|
||
|
|
sizeLimit: 64Gi
|
||
|
|
name: shm
|
||
|
|
- name: kube-api-access-dlhrd
|
||
|
|
projected:
|
||
|
|
defaultMode: 420
|
||
|
|
sources:
|
||
|
|
- serviceAccountToken:
|
||
|
|
expirationSeconds: 3607
|
||
|
|
path: token
|
||
|
|
- configMap:
|
||
|
|
items:
|
||
|
|
- key: ca.crt
|
||
|
|
path: ca.crt
|
||
|
|
name: kube-root-ca.crt
|
||
|
|
- downwardAPI:
|
||
|
|
items:
|
||
|
|
- fieldRef:
|
||
|
|
apiVersion: v1
|
||
|
|
fieldPath: metadata.namespace
|
||
|
|
path: namespace
|
||
|
|
status:
|
||
|
|
conditions:
|
||
|
|
- lastProbeTime: null
|
||
|
|
lastTransitionTime: '2026-04-15T22:38:32Z'
|
||
|
|
observedGeneration: 1
|
||
|
|
status: 'True'
|
||
|
|
type: PodReadyToStartContainers
|
||
|
|
- lastProbeTime: null
|
||
|
|
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||
|
|
observedGeneration: 1
|
||
|
|
status: 'True'
|
||
|
|
type: Initialized
|
||
|
|
- lastProbeTime: null
|
||
|
|
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||
|
|
message: 'containers with unready status: [vllm]'
|
||
|
|
observedGeneration: 1
|
||
|
|
reason: ContainersNotReady
|
||
|
|
status: 'False'
|
||
|
|
type: Ready
|
||
|
|
- lastProbeTime: null
|
||
|
|
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||
|
|
message: 'containers with unready status: [vllm]'
|
||
|
|
observedGeneration: 1
|
||
|
|
reason: ContainersNotReady
|
||
|
|
status: 'False'
|
||
|
|
type: ContainersReady
|
||
|
|
- lastProbeTime: null
|
||
|
|
lastTransitionTime: '2026-04-15T22:38:27Z'
|
||
|
|
observedGeneration: 1
|
||
|
|
status: 'True'
|
||
|
|
type: PodScheduled
|
||
|
|
containerStatuses:
|
||
|
|
- allocatedResources:
|
||
|
|
cpu: '8'
|
||
|
|
memory: 16Gi
|
||
|
|
nvidia.com/gpu: '8'
|
||
|
|
rdma/ib: '1'
|
||
|
|
containerID: >-
|
||
|
|
containerd://a8114b18618b1a3247085dccfc7ae71c7c4260f5007d1001b1bb6d122969e63b
|
||
|
|
image: atl.vultrcr.com/vllm/vllm-with-lmcache:v0.19.0-cu130
|
||
|
|
imageID: >-
|
||
|
|
atl.vultrcr.com/vllm/vllm-with-lmcache@sha256:d969603ab8bb8c8375a36951ff083942fe5e04607de647f2d89c186947569a59
|
||
|
|
lastState:
|
||
|
|
terminated:
|
||
|
|
containerID: >-
|
||
|
|
containerd://596ffff668ab633b22306dd73ed1a08541187eb39fb4ac2d8187e668dbd7bb2a
|
||
|
|
exitCode: 1
|
||
|
|
finishedAt: '2026-04-15T22:42:20Z'
|
||
|
|
reason: Error
|
||
|
|
startedAt: '2026-04-15T22:38:31Z'
|
||
|
|
name: vllm
|
||
|
|
ready: false
|
||
|
|
resources:
|
||
|
|
limits:
|
||
|
|
memory: 1500Gi
|
||
|
|
nvidia.com/gpu: '8'
|
||
|
|
rdma/ib: '1'
|
||
|
|
requests:
|
||
|
|
cpu: '8'
|
||
|
|
memory: 16Gi
|
||
|
|
nvidia.com/gpu: '8'
|
||
|
|
rdma/ib: '1'
|
||
|
|
restartCount: 1
|
||
|
|
started: false
|
||
|
|
state:
|
||
|
|
running:
|
||
|
|
startedAt: '2026-04-15T22:42:24Z'
|
||
|
|
volumeMounts:
|
||
|
|
- mountPath: /dev/shm
|
||
|
|
name: shm
|
||
|
|
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
|
||
|
|
name: kube-api-access-dlhrd
|
||
|
|
readOnly: true
|
||
|
|
recursiveReadOnly: Disabled
|
||
|
|
hostIP: 10.4.96.13
|
||
|
|
hostIPs:
|
||
|
|
- ip: 10.4.96.13
|
||
|
|
- ip: 2001:19f0:8000:368a:5400:5ff:fefd:de32
|
||
|
|
observedGeneration: 1
|
||
|
|
phase: Running
|
||
|
|
podIP: 10.244.248.111
|
||
|
|
podIPs:
|
||
|
|
- ip: 10.244.248.111
|
||
|
|
- ip: fd10:1ba:6d2c:1000:129b:6fa:8473:78d0
|
||
|
|
qosClass: Burstable
|
||
|
|
startTime: '2026-04-15T22:38:27Z'
|