Skip to content

Commit 624de6d

Browse files
afeldman-nm and cboss6
authored and committed
[CI] Speed up model unit tests in CI (vllm-project#24253)
Signed-off-by: Andrew Feldman <[email protected]> Signed-off-by: bruceszchen <[email protected]>
1 parent 2805742 commit 624de6d

File tree

6 files changed

+123
-21
lines changed

6 files changed

+123
-21
lines changed

.buildkite/test-pipeline.yaml

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -571,44 +571,98 @@ steps:
571571

572572
##### models test #####
573573

574-
- label: Basic Models Test # 57min
575-
timeout_in_minutes: 75
574+
- label: Basic Models Tests (Initialization)
575+
timeout_in_minutes: 45
576576
mirror_hardwares: [amdexperimental]
577577
torch_nightly: true
578578
source_file_dependencies:
579579
- vllm/
580-
- tests/models
580+
- tests/models/test_initialization.py
581581
commands:
582-
- pytest -v -s models/test_transformers.py
583-
- pytest -v -s models/test_registry.py
584-
- pytest -v -s models/test_utils.py
585-
- pytest -v -s models/test_vision.py
586-
- pytest -v -s models/test_initialization.py
582+
# Run a subset of model initialization tests
583+
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
587584

588-
- label: Language Models Test (Standard) # 35min
585+
- label: Basic Models Tests (Extra Initialization) %N
589586
timeout_in_minutes: 45
590587
mirror_hardwares: [amdexperimental]
591588
torch_nightly: true
592589
source_file_dependencies:
590+
- vllm/model_executor/models/
591+
- tests/models/test_initialization.py
592+
commands:
593+
# Only when vLLM model source is modified - test initialization of a large
594+
# subset of supported models (the complement of the small subset in the above
595+
# test.) Also run if model initialization test file is modified
596+
- pytest -v -s models/test_initialization.py \
597+
-k 'not test_can_initialize_small_subset' \
598+
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
599+
--shard-id=$$BUILDKITE_PARALLEL_JOB
600+
parallelism: 2
601+
602+
- label: Basic Models Tests (Other)
603+
timeout_in_minutes: 45
604+
mirror_hardwares: [amdexperimental]
605+
torch_nightly: true
606+
source_file_dependencies:
607+
- vllm/
608+
- tests/models/test_transformers.py
609+
- tests/models/test_registry.py
610+
- tests/models/test_utils.py
611+
- tests/models/test_vision.py
612+
commands:
613+
- pytest -v -s models/test_transformers.py \
614+
models/test_registry.py \
615+
models/test_utils.py \
616+
models/test_vision.py
617+
618+
- label: Language Models Tests (Standard)
619+
timeout_in_minutes: 25
620+
mirror_hardwares: [amdexperimental]
621+
torch_nightly: true
622+
source_file_dependencies:
593623
- vllm/
594624
- tests/models/language
595625
commands:
626+
# Test standard language models, excluding a subset of slow tests
596627
- pip freeze | grep -E 'torch'
597-
- pytest -v -s models/language -m core_model
628+
- pytest -v -s models/language -m 'core_model and (not slow_test)'
598629

599-
- label: Language Models Test (Hybrid) # 35 min
630+
- label: Language Models Tests (Extra Standard) %N
600631
timeout_in_minutes: 45
601632
mirror_hardwares: [amdexperimental]
602633
torch_nightly: true
603634
source_file_dependencies:
635+
- vllm/model_executor/models/
636+
- tests/models/language/pooling/test_embedding.py
637+
- tests/models/language/generation/test_common.py
638+
- tests/models/language/pooling/test_classification.py
639+
commands:
640+
# Shard slow subset of standard language models tests. Only run when model
641+
# source is modified, or when specified test files are modified
642+
- pip freeze | grep -E 'torch'
643+
- pytest -v -s models/language -m 'core_model and slow_test' \
644+
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
645+
--shard-id=$$BUILDKITE_PARALLEL_JOB
646+
parallelism: 2
647+
648+
- label: Language Models Tests (Hybrid) %N
649+
timeout_in_minutes: 75
650+
mirror_hardwares: [amdexperimental]
651+
torch_nightly: true
652+
source_file_dependencies:
604653
- vllm/
605654
- tests/models/language/generation
606655
commands:
607656
# Install fast path packages for testing against transformers
608657
# Note: also needed to run plamo2 model in vLLM
609658
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/[email protected]'
610659
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/[email protected]'
611-
- pytest -v -s models/language/generation -m hybrid_model
660+
# Shard hybrid language model tests
661+
- pytest -v -s models/language/generation \
662+
-m hybrid_model \
663+
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
664+
--shard-id=$$BUILDKITE_PARALLEL_JOB
665+
parallelism: 2
612666

613667
- label: Language Models Test (Extended Generation) # 80min
614668
timeout_in_minutes: 110

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ skip_gitignore = true
145145

146146
[tool.pytest.ini_options]
147147
markers = [
148+
"slow_test",
148149
"skip_global_cleanup",
149150
"core_model: enable this model test in each PR instead of only nightly",
150151
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",

tests/models/language/generation/test_common.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
[
3939
pytest.param(
4040
"bigscience/bloom-560m", # bloom - testing alibi slopes
41-
marks=[pytest.mark.core_model],
41+
marks=[pytest.mark.core_model, pytest.mark.slow_test],
4242
),
4343
pytest.param(
4444
"openai-community/gpt2", # gpt2
@@ -49,7 +49,10 @@
4949
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
5050
pytest.param(
5151
"google/gemma-1.1-2b-it", # gemma
52-
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
52+
marks=[
53+
pytest.mark.core_model, pytest.mark.cpu_model,
54+
pytest.mark.slow_test
55+
],
5356
),
5457
pytest.param(
5558
"zai-org/chatglm3-6b", # chatglm (text-only)
@@ -70,14 +73,17 @@
7073
),
7174
pytest.param(
7275
"microsoft/phi-2", # phi
73-
marks=[pytest.mark.core_model],
76+
marks=[pytest.mark.core_model, pytest.mark.slow_test],
7477
),
7578
pytest.param(
7679
"Qwen/Qwen-7B-Chat", # qwen (text-only)
7780
),
7881
pytest.param(
7982
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
80-
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
83+
marks=[
84+
pytest.mark.core_model, pytest.mark.cpu_model,
85+
pytest.mark.slow_test
86+
],
8187
),
8288
pytest.param(
8389
"Qwen/Qwen3-8B", # qwen (text-only)

tests/models/language/pooling/test_classification.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
"model",
1212
[
1313
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
14-
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
14+
marks=[
15+
pytest.mark.core_model, pytest.mark.cpu_model,
16+
pytest.mark.slow_test
17+
]),
1518
],
1619
)
1720
@pytest.mark.parametrize("dtype",

tests/models/language/pooling/test_embedding.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
# model code with bidirectional attention.
2020
# [Decoder-only]
2121
pytest.param("BAAI/bge-multilingual-gemma2",
22-
marks=[pytest.mark.core_model]),
22+
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
2323
pytest.param(
2424
"intfloat/e5-mistral-7b-instruct",
2525
# CPU v1 doesn't support sliding window
@@ -29,7 +29,10 @@
2929
# [Encoder-only]
3030
pytest.param(
3131
"BAAI/bge-base-en-v1.5",
32-
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
32+
marks=[
33+
pytest.mark.core_model, pytest.mark.cpu_model,
34+
pytest.mark.slow_test
35+
],
3336
),
3437
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
3538
pytest.param("intfloat/multilingual-e5-small"),

tests/models/test_initialization.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,26 @@
1818
HF_EXAMPLE_MODELS, HfExampleModels)
1919
from .utils import dummy_hf_overrides
2020

21+
# This minimal list of model architectures is smaller than the total list of
22+
# supported models. The intention is that in the "typical" regression testing
23+
# scenario, we only test initializing these models. This subset was chosen
24+
# to include representative examples of model varieties/workloads (conditional
25+
# generation, sequence classification, causal LM, ranking, chat, reward model,
26+
# multimodal, geospatial, voice, embedding, MTP)
27+
MINIMAL_MODEL_ARCH_LIST = [
28+
"LlavaForConditionalGeneration", "Llama4ForConditionalGeneration",
29+
"BertForSequenceClassification", "Gemma3nForCausalLM", "JinaVLForRanking",
30+
"InternVLChatModel", "InternLM2ForRewardModel",
31+
"TransformersForMultimodalLM", "PrithviGeoSpatialMAE", "UltravoxModel",
32+
"DeepSeekMTPModel", "XLMRobertaModel"
33+
]
34+
35+
# This list is the complement of the minimal list above. The intention is that
36+
# this list of models is only tested in a "special case" i.e. most PRs should
37+
# not test these models
38+
OTHER_MODEL_ARCH_LIST = (set(HF_EXAMPLE_MODELS.get_supported_archs()) -
39+
set(MINIMAL_MODEL_ARCH_LIST))
40+
2141

2242
@create_new_process_for_each_test()
2343
def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
@@ -101,8 +121,23 @@ def _initialize_kv_caches_v1(self, vllm_config):
101121
max_num_seqs=model_info.max_num_seqs)
102122

103123

104-
@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
105-
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
124+
@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
125+
def test_can_initialize_small_subset(model_arch: str,
126+
monkeypatch: pytest.MonkeyPatch):
127+
"""Test initializing small subset of supported models"""
128+
if model_arch == "Lfm2ForCausalLM":
129+
pytest.skip("Skipping until test supports V1-only models")
130+
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
131+
132+
133+
@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
134+
def test_can_initialize_large_subset(model_arch: str,
135+
monkeypatch: pytest.MonkeyPatch):
136+
"""Test initializing large subset of supported models
137+
138+
This test covers the complement of the tests covered in the "small subset"
139+
test.
140+
"""
106141
if model_arch == "Lfm2ForCausalLM":
107142
pytest.skip("Skipping until test supports V1-only models")
108143
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)

0 commit comments

Comments
 (0)