Skip to content

Commit 624de6d

Browse files
afeldman-nm and cboss6
authored and committed
[CI] Speed up model unit tests in CI (vllm-project#24253)
Signed-off-by: Andrew Feldman <[email protected]> Signed-off-by: bruceszchen <[email protected]>
1 parent 2805742 commit 624de6d

File tree

6 files changed

+123
-21
lines changed

6 files changed

+123
-21
lines changed

.buildkite/test-pipeline.yaml

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -571,44 +571,98 @@ steps:
571571

572572
##### models test #####
573573

574-
- label: Basic Models Test # 57min
575-
timeout_in_minutes: 75
574+
- label: Basic Models Tests (Initialization)
575+
timeout_in_minutes: 45
576576
mirror_hardwares: [amdexperimental]
577577
torch_nightly: true
578578
source_file_dependencies:
579579
- vllm/
580-
- tests/models
580+
- tests/models/test_initialization.py
581581
commands:
582-
- pytest -v -s models/test_transformers.py
583-
- pytest -v -s models/test_registry.py
584-
- pytest -v -s models/test_utils.py
585-
- pytest -v -s models/test_vision.py
586-
- pytest -v -s models/test_initialization.py
582+
# Run a subset of model initialization tests
583+
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
587584

588-
- label: Language Models Test (Standard) # 35min
585+
- label: Basic Models Tests (Extra Initialization) %N
589586
timeout_in_minutes: 45
590587
mirror_hardwares: [amdexperimental]
591588
torch_nightly: true
592589
source_file_dependencies:
590+
- vllm/model_executor/models/
591+
- tests/models/test_initialization.py
592+
commands:
593+
# Only when vLLM model source is modified - test initialization of a large
594+
# subset of supported models (the complement of the small subset in the above
595+
# test.) Also run if model initialization test file is modified
596+
- pytest -v -s models/test_initialization.py \
597+
-k 'not test_can_initialize_small_subset' \
598+
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
599+
--shard-id=$$BUILDKITE_PARALLEL_JOB
600+
parallelism: 2
601+
602+
- label: Basic Models Tests (Other)
603+
timeout_in_minutes: 45
604+
mirror_hardwares: [amdexperimental]
605+
torch_nightly: true
606+
source_file_dependencies:
607+
- vllm/
608+
- tests/models/test_transformers.py
609+
- tests/models/test_registry.py
610+
- tests/models/test_utils.py
611+
- tests/models/test_vision.py
612+
commands:
613+
- pytest -v -s models/test_transformers.py \
614+
models/test_registry.py \
615+
models/test_utils.py \
616+
models/test_vision.py
617+
618+
- label: Language Models Tests (Standard)
619+
timeout_in_minutes: 25
620+
mirror_hardwares: [amdexperimental]
621+
torch_nightly: true
622+
source_file_dependencies:
593623
- vllm/
594624
- tests/models/language
595625
commands:
626+
# Test standard language models, excluding a subset of slow tests
596627
- pip freeze | grep -E 'torch'
597-
- pytest -v -s models/language -m core_model
628+
- pytest -v -s models/language -m 'core_model and (not slow_test)'
598629

599-
- label: Language Models Test (Hybrid) # 35 min
630+
- label: Language Models Tests (Extra Standard) %N
600631
timeout_in_minutes: 45
601632
mirror_hardwares: [amdexperimental]
602633
torch_nightly: true
603634
source_file_dependencies:
635+
- vllm/model_executor/models/
636+
- tests/models/language/pooling/test_embedding.py
637+
- tests/models/language/generation/test_common.py
638+
- tests/models/language/pooling/test_classification.py
639+
commands:
640+
# Shard slow subset of standard language models tests. Only run when model
641+
# source is modified, or when specified test files are modified
642+
- pip freeze | grep -E 'torch'
643+
- pytest -v -s models/language -m 'core_model and slow_test' \
644+
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
645+
--shard-id=$$BUILDKITE_PARALLEL_JOB
646+
parallelism: 2
647+
648+
- label: Language Models Tests (Hybrid) %N
649+
timeout_in_minutes: 75
650+
mirror_hardwares: [amdexperimental]
651+
torch_nightly: true
652+
source_file_dependencies:
604653
- vllm/
605654
- tests/models/language/generation
606655
commands:
607656
# Install fast path packages for testing against transformers
608657
# Note: also needed to run plamo2 model in vLLM
609658
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/[email protected]'
610659
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/[email protected]'
611-
- pytest -v -s models/language/generation -m hybrid_model
660+
# Shard hybrid language model tests
661+
- pytest -v -s models/language/generation \
662+
-m hybrid_model \
663+
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
664+
--shard-id=$$BUILDKITE_PARALLEL_JOB
665+
parallelism: 2
612666

613667
- label: Language Models Test (Extended Generation) # 80min
614668
timeout_in_minutes: 110

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ skip_gitignore = true
145145

146146
[tool.pytest.ini_options]
147147
markers = [
148+
"slow_test",
148149
"skip_global_cleanup",
149150
"core_model: enable this model test in each PR instead of only nightly",
150151
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",

tests/models/language/generation/test_common.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
[
3939
pytest.param(
4040
"bigscience/bloom-560m", # bloom - testing alibi slopes
41-
marks=[pytest.mark.core_model],
41+
marks=[pytest.mark.core_model, pytest.mark.slow_test],
4242
),
4343
pytest.param(
4444
"openai-community/gpt2", # gpt2
@@ -49,7 +49,10 @@
4949
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
5050
pytest.param(
5151
"google/gemma-1.1-2b-it", # gemma
52-
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
52+
marks=[
53+
pytest.mark.core_model, pytest.mark.cpu_model,
54+
pytest.mark.slow_test
55+
],
5356
),
5457
pytest.param(
5558
"zai-org/chatglm3-6b", # chatglm (text-only)
@@ -70,14 +73,17 @@
7073
),
7174
pytest.param(
7275
"microsoft/phi-2", # phi
73-
marks=[pytest.mark.core_model],
76+
marks=[pytest.mark.core_model, pytest.mark.slow_test],
7477
),
7578
pytest.param(
7679
"Qwen/Qwen-7B-Chat", # qwen (text-only)
7780
),
7881
pytest.param(
7982
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
80-
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
83+
marks=[
84+
pytest.mark.core_model, pytest.mark.cpu_model,
85+
pytest.mark.slow_test
86+
],
8187
),
8288
pytest.param(
8389
"Qwen/Qwen3-8B", # qwen (text-only)

tests/models/language/pooling/test_classification.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
"model",
1212
[
1313
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
14-
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
14+
marks=[
15+
pytest.mark.core_model, pytest.mark.cpu_model,
16+
pytest.mark.slow_test
17+
]),
1518
],
1619
)
1720
@pytest.mark.parametrize("dtype",

tests/models/language/pooling/test_embedding.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
# model code with bidirectional attention.
2020
# [Decoder-only]
2121
pytest.param("BAAI/bge-multilingual-gemma2",
22-
marks=[pytest.mark.core_model]),
22+
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
2323
pytest.param(
2424
"intfloat/e5-mistral-7b-instruct",
2525
# CPU v1 doesn't support sliding window
@@ -29,7 +29,10 @@
2929
# [Encoder-only]
3030
pytest.param(
3131
"BAAI/bge-base-en-v1.5",
32-
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
32+
marks=[
33+
pytest.mark.core_model, pytest.mark.cpu_model,
34+
pytest.mark.slow_test
35+
],
3336
),
3437
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
3538
pytest.param("intfloat/multilingual-e5-small"),

tests/models/test_initialization.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,26 @@
1818
HF_EXAMPLE_MODELS, HfExampleModels)
1919
from .utils import dummy_hf_overrides
2020

21+
# This minimal list of model architectures is smaller than the total list of
22+
# supported models. The intention is that in the "typical" regression testing
23+
# scenario, we only test initializing these models. This subset was chosen
24+
# to include representative examples of model varieties/workloads (conditional
25+
# generation, sequence classification, causal LM, ranking, chat, reward model,
26+
# multimodal, geospatial, voice, embedding, MTP)
27+
MINIMAL_MODEL_ARCH_LIST = [
28+
"LlavaForConditionalGeneration", "Llama4ForConditionalGeneration",
29+
"BertForSequenceClassification", "Gemma3nForCausalLM", "JinaVLForRanking",
30+
"InternVLChatModel", "InternLM2ForRewardModel",
31+
"TransformersForMultimodalLM", "PrithviGeoSpatialMAE", "UltravoxModel",
32+
"DeepSeekMTPModel", "XLMRobertaModel"
33+
]
34+
35+
# This list is the complement of the minimal list above. The intention is that
36+
# this list of models is only tested in a "special case" i.e. most PRs should
37+
# not test these models
38+
OTHER_MODEL_ARCH_LIST = (set(HF_EXAMPLE_MODELS.get_supported_archs()) -
39+
set(MINIMAL_MODEL_ARCH_LIST))
40+
2141

2242
@create_new_process_for_each_test()
2343
def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
@@ -101,8 +121,23 @@ def _initialize_kv_caches_v1(self, vllm_config):
101121
max_num_seqs=model_info.max_num_seqs)
102122

103123

104-
@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
105-
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
124+
@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
125+
def test_can_initialize_small_subset(model_arch: str,
126+
monkeypatch: pytest.MonkeyPatch):
127+
"""Test initializing small subset of supported models"""
128+
if model_arch == "Lfm2ForCausalLM":
129+
pytest.skip("Skipping until test supports V1-only models")
130+
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
131+
132+
133+
@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
134+
def test_can_initialize_large_subset(model_arch: str,
135+
monkeypatch: pytest.MonkeyPatch):
136+
"""Test initializing large subset of supported models
137+
138+
This test covers the complement of the tests covered in the "small subset"
139+
test.
140+
"""
106141
if model_arch == "Lfm2ForCausalLM":
107142
pytest.skip("Skipping until test supports V1-only models")
108143
can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)

0 commit comments

Comments
 (0)