Version
Qwen3-8B
On this page
An 8.2B dense chat model on a single NVIDIA L4. The smallest recipe: one
Standalone engine, no cache, weights pulled straight from Hugging Face.
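This recipe was run end to end; the InferenceClass and ModelDeployment are
the exact manifests from that run. Apply the platform side (the InferenceClass
and InferenceCluster under "Platform") first, then the ML side (the
ModelDeployment and ModelService under "Deployment").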
Platform
inference-class.yaml
# InferenceClass for the L4 shape, validated serving Qwen3-8B on EKS.
#
# One NVIDIA L4 on an EKS g6.xlarge. The single GPU is declared as a
# `claim: DRA` device: the scheduler matches a ModelDeployment's nodeSelector
# against the capacity declared here, and DRA binds the device to the serving
# pod.
#
# NOTE(review): indentation was reconstructed from a flattened copy of this
# manifest. The placement of `accelerator` (under `provisioning.eks`) and of
# `devices` (directly under `spec`) is inferred -- confirm against the
# InferenceClass schema before applying.
apiVersion: modelplane.ai/v1alpha1
kind: InferenceClass
metadata:
  name: eks-l4-1x-g6
spec:
  description: "EKS g6.xlarge, 1x NVIDIA L4"
  provisioning:
    provider: EKS
    eks:
      instanceType: g6.xlarge
      diskSizeGb: 100
      accelerator:
        type: nvidia-l4
        count: 1
  devices:
    # `name` is what a ModelDeployment's nodeSelector.devices entry refers to.
    - name: gpu
      claim: DRA
      driver: gpu.nvidia.com
      deviceClassName: gpu.nvidia.com
      count: 1
      attributes:
        architecture: { string: "Ada Lovelace" }
      capacity:
        # The L4's real usable VRAM as the NVIDIA DRA driver reports it, not
        # the nominal 24GB.
        memory: { value: "23034Mi" }
inference-cluster.yaml
# An EKS InferenceCluster with one L4 node pool, labeled for the
# ModelDeployment's clusterSelector to target.
#
# NOTE(review): indentation was reconstructed from a flattened copy of this
# manifest. `nodePools` is placed directly under `spec` (sibling of `cluster`);
# that placement is inferred -- confirm against the InferenceCluster schema.
apiVersion: modelplane.ai/v1alpha1
kind: InferenceCluster
metadata:
  name: eks-l4
  labels:
    # Matched by the ModelDeployment's clusterSelector.matchLabels.
    modelplane.ai/region: us
spec:
  cluster:
    source: EKS
    eks:
      region: us-west-2
  nodePools:
    - name: gpu-l4
      # References the InferenceClass named eks-l4-1x-g6.
      className: eks-l4-1x-g6
      # Fixed-size pool: node count, minimum and maximum are all 1.
      nodeCount: 1
      minNodeCount: 1
      maxNodeCount: 1
      zones:
        - us-west-2a
Deployment
model-deployment.yaml
# Qwen3-8B served on a single NVIDIA L4, validated end to end on EKS.
#
# An 8.2B dense model is a single Standalone engine: one self-contained vLLM
# pod, no ModelCache, weights pulled straight from Hugging Face. The flags
# carry real meaning beyond fit:
#
#   --tool-call-parser=hermes
#       The parser for Qwen3 dense (qwen3_xml is for Qwen3-Coder, not this
#       model). Qwen3's tool-use template ships in the tokenizer, so no
#       --chat-template is needed.
#   --reasoning-parser=qwen3 with --default-chat-template-kwargs
#       Turns thinking off. Qwen3 thinks by default, burying a one-line
#       answer under a <think> block and forbidding greedy decode.
#   --max-model-len / --gpu-memory-utilization
#       L4 fit, not correctness.
#
# No --port or --host: Modelplane's routing expects the engine on its default
# :8000 with a /health probe, and passes args through verbatim.
#
# NOTE(review): indentation was reconstructed from a flattened copy of this
# manifest. `template` is placed under the member (sibling of `role` and
# `nodeSelector`); that placement is inferred -- confirm against the
# ModelDeployment schema before applying.
apiVersion: modelplane.ai/v1alpha1
kind: ModelDeployment
metadata:
  name: qwen3-8b
  namespace: ml-team
spec:
  replicas: 1
  clusterSelector:
    matchLabels:
      # Same label the eks-l4 InferenceCluster carries.
      modelplane.ai/region: us
  engines:
    - name: qwen3-8b
      members:
        - role: Standalone
          nodeSelector:
            devices:
              - name: gpu
                count: 1
                selectors:
                  # Require at least 20Gi of device memory; the
                  # eks-l4-1x-g6 InferenceClass declares 23034Mi.
                  - cel: |
                      device.capacity["gpu.nvidia.com"].memory.compareTo(quantity("20Gi")) >= 0
          template:
            spec:
              containers:
                - name: engine
                  image: vllm/vllm-openai:v0.23.0
                  args:
                    - '--model=Qwen/Qwen3-8B'
                    - '--served-model-name=qwen'
                    - '--max-model-len=16384'
                    - '--gpu-memory-utilization=0.92'
                    - '--reasoning-parser=qwen3'
                    # Single-quoted so the JSON needs no backslash escapes.
                    - '--default-chat-template-kwargs={"enable_thinking": false}'
                    - '--enable-auto-tool-choice'
                    - '--tool-call-parser=hermes'
model-service.yaml
# Exposes the qwen3-8b deployment's endpoints as a single OpenAI-compatible
# URL. Modelplane labels each composed ModelEndpoint with the deployment name,
# so this selector reaches every replica. Read the public address from
# status.address:
#
#   kubectl get ms qwen3-8b -n ml-team -o jsonpath='{.status.address}'
apiVersion: modelplane.ai/v1alpha1
kind: ModelService
metadata:
  name: qwen3-8b
  namespace: ml-team
spec:
  endpoints:
    - selector:
        matchLabels:
          # Value is the ModelDeployment's metadata.name.
          modelplane.ai/deployment: qwen3-8b