Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
f65ff24
test-pipeline
afeldman-nm Sep 4, 2025
d5fa076
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 4, 2025
35a331b
basic tests
afeldman-nm Sep 4, 2025
d630671
common and embedding markers
afeldman-nm Sep 4, 2025
7d31d30
parallelize basic tests
afeldman-nm Sep 4, 2025
6c8c6e9
adjust
afeldman-nm Sep 4, 2025
3709f82
wip
afeldman-nm Sep 5, 2025
0a12b97
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 5, 2025
183f97d
revise
afeldman-nm Sep 5, 2025
bba582f
burn changes
afeldman-nm Sep 8, 2025
b2a3d02
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 8, 2025
36a4875
Break out different initialization tests
afeldman-nm Sep 8, 2025
e32e371
shard hybrid models
afeldman-nm Sep 8, 2025
1d883af
Standard language models test
afeldman-nm Sep 8, 2025
774b9e4
small fix
afeldman-nm Sep 9, 2025
4941627
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 9, 2025
1fd0c68
wip
afeldman-nm Sep 9, 2025
0093c1f
typo
afeldman-nm Sep 9, 2025
62e8055
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 9, 2025
0892a96
test
afeldman-nm Sep 10, 2025
a5ecbcf
test
afeldman-nm Sep 10, 2025
fa64ce0
test
afeldman-nm Sep 10, 2025
1ac85fd
test
afeldman-nm Sep 10, 2025
4f4f60c
wip
afeldman-nm Sep 10, 2025
5fc5910
wip
afeldman-nm Sep 10, 2025
39df6a1
wip
afeldman-nm Sep 10, 2025
0b9c6e7
wip
afeldman-nm Sep 10, 2025
8f58547
different approach to sharding
afeldman-nm Sep 10, 2025
d3713c0
merge
afeldman-nm Sep 10, 2025
e70c8cd
percent N
afeldman-nm Sep 10, 2025
682d675
fix
afeldman-nm Sep 10, 2025
ee2b0dc
more sharding
afeldman-nm Sep 10, 2025
4a26031
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 10, 2025
c288ab3
isolate test; lower parallelism
afeldman-nm Sep 11, 2025
6ee7661
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 11, 2025
ff660fc
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 11, 2025
e66b7e6
lower shard factor and reorganize
afeldman-nm Sep 11, 2025
28239d4
explanatory comments
afeldman-nm Sep 11, 2025
009458f
wip
afeldman-nm Sep 11, 2025
63015ab
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 11, 2025
aef7215
test names
afeldman-nm Sep 11, 2025
09e84a4
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 12, 2025
bb15907
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 12, 2025
a806634
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 86 additions & 6 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,32 @@ steps:

##### models test #####

- label: Basic Models Test # 57min
# Test subset of models when vLLM source is modified.
# Always run if test file is modified.
- label: Basic Models Initialization Test (Subset)
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_initialization.py
commands:
- pytest -v -s models/test_initialization.py::test_can_initialize_subset

# Test all other models when any model source is modified.
# Always run if test file is modified.
- label: Basic Models Initialization Test (Full)
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/test_initialization.py
commands:
- pytest -v -s models/test_initialization.py::test_can_initialize_other

# Other non-initialization models tests
- label: Basic Models Test
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
Expand All @@ -558,9 +583,8 @@ steps:
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py
- pytest -v -s models/test_vision.py
- pytest -v -s models/test_initialization.py

- label: Language Models Test (Standard) # 35min
- label: Standard Language Models Test (Subset)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
Expand All @@ -569,9 +593,65 @@ steps:
- tests/models/language
commands:
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model
- pytest -v -s models/language -m "core_model and not slow_test"

- label: Standard Language Models Test (Other)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/language/pooling/test_embedding.py
- tests/models/language/generation/test_common.py
- tests/models/language/pooling/test_classification.py
commands:
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m "core_model and slow_test"

# Shard hybrid language model tests
- label: Language Models Test Shard 0 (Hybrid)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/[email protected]'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/[email protected]'
- pytest -v -s --shard-count=4 --shard-index=0 models/language/generation -m hybrid_model

- label: Language Models Test Shard 1 (Hybrid)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/[email protected]'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/[email protected]'
- pytest -v -s --shard-count=4 --shard-index=1 models/language/generation -m hybrid_model

- label: Language Models Test Shard 2 (Hybrid)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/[email protected]'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/[email protected]'
- pytest -v -s --shard-count=4 --shard-index=2 models/language/generation -m hybrid_model

- label: Language Models Test (Hybrid) # 35 min
- label: Language Models Test Shard 3 (Hybrid)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
Expand All @@ -583,7 +663,7 @@ steps:
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/[email protected]'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/[email protected]'
- pytest -v -s models/language/generation -m hybrid_model
- pytest -v -s --shard-count=4 --shard-index=3 models/language/generation -m hybrid_model

- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ skip_gitignore = true

[tool.pytest.ini_options]
markers = [
"slow_test",
"skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly",
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
Expand Down
14 changes: 10 additions & 4 deletions tests/models/language/generation/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
[
pytest.param(
"bigscience/bloom-560m", # bloom - testing alibi slopes
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.slow_test],
),
pytest.param(
"openai-community/gpt2", # gpt2
Expand All @@ -50,7 +50,10 @@
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
pytest.param(
"google/gemma-1.1-2b-it", # gemma
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
],
),
pytest.param(
"zai-org/chatglm3-6b", # chatglm (text-only)
Expand All @@ -71,14 +74,17 @@
),
pytest.param(
"microsoft/phi-2", # phi
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.slow_test],
),
pytest.param(
"Qwen/Qwen-7B-Chat", # qwen (text-only)
),
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
],
),
pytest.param(
"Qwen/Qwen3-8B", # qwen (text-only)
Expand Down
5 changes: 4 additions & 1 deletion tests/models/language/pooling/test_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
"model",
[
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
]),
],
)
@pytest.mark.parametrize("dtype",
Expand Down
5 changes: 3 additions & 2 deletions tests/models/language/pooling/test_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,16 @@
# model code with bidirectional attention.
# [Decoder-only]
pytest.param("BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model]),
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
pytest.param(
"intfloat/e5-mistral-7b-instruct",
# CPU v1 doesn't support sliding window
marks=[pytest.mark.core_model]),
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.cpu_model]),
# [Encoder-only]
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
pytest.param("BAAI/bge-base-en-v1.5",
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
# [Cross-Encoder]
Expand Down
25 changes: 23 additions & 2 deletions tests/models/test_initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@
HF_EXAMPLE_MODELS, HfExampleModels)
from .utils import dummy_hf_overrides

# Curated subset of model architectures exercised by the fast "Subset"
# initialization test, which runs whenever any vLLM source file changes.
# All remaining supported architectures are covered by the slower "Full"
# test via OTHER_MODEL_ARCH_LIST, which only runs when model code changes.
#
# NOTE: the original list contained `"UltravoxModel," "DeepSeekMTPModel"`
# — a comma INSIDE the first string literal instead of between the two —
# which Python's implicit string-literal concatenation fused into the
# single nonexistent architecture "UltravoxModel,DeepSeekMTPModel".
# Neither model was actually tested in the subset. Fixed below.
MINIMAL_MODEL_ARCH_LIST = [
    "LlavaForConditionalGeneration",
    "Llama4ForConditionalGeneration",
    "BertForSequenceClassification",
    "Gemma3nForCausalLM",
    "JinaVLForRanking",
    "InternVLChatModel",
    "InternLM2ForRewardModel",
    "TransformersForMultimodalLM",
    "PrithviGeoSpatialMAE",
    "UltravoxModel",
    "DeepSeekMTPModel",
    "MedusaModel",
    "TransformersModel",
    "MiDashengLMModel",
    "XLMRobertaModel",
]

# Every architecture HF_EXAMPLE_MODELS supports that is not in the curated
# minimal subset; exercised by the slower "Full" initialization test.
OTHER_MODEL_ARCH_LIST = set(
    HF_EXAMPLE_MODELS.get_supported_archs()).difference(
        MINIMAL_MODEL_ARCH_LIST)


@create_new_process_for_each_test()
def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
Expand Down Expand Up @@ -91,13 +103,22 @@ def _initialize_kv_caches_v1(self, vllm_config):
max_num_seqs=model_info.max_num_seqs)


@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
def test_can_initialize_other(model_arch: str,
                              monkeypatch: pytest.MonkeyPatch):
    """Test initializing every supported model architecture that is NOT in
    the curated minimal subset (those are covered separately by
    test_can_initialize_subset)."""
    if model_arch == "Lfm2ForCausalLM":
        # Harness limitation, not a model bug — see skip message.
        pytest.skip("Skipping until test supports V1-only models")
    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)


@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
def test_can_initialize_subset(model_arch: str,
                               monkeypatch: pytest.MonkeyPatch):
    """Test initializing select subset of supported models.

    This curated subset runs on every vLLM source change (see the
    "Basic Models Initialization Test (Subset)" step in
    .buildkite/test-pipeline.yaml), so it is kept deliberately small.
    """
    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)


@pytest.mark.parametrize("model_arch",
AUTO_EXAMPLE_MODELS.get_supported_archs())
def test_implicit_converted_models(model_arch: str,
Expand Down