Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
f65ff24
test-pipeline
afeldman-nm Sep 4, 2025
d5fa076
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 4, 2025
35a331b
basic tests
afeldman-nm Sep 4, 2025
d630671
common and embedding markers
afeldman-nm Sep 4, 2025
7d31d30
parallelize basic tests
afeldman-nm Sep 4, 2025
6c8c6e9
adjust
afeldman-nm Sep 4, 2025
3709f82
wip
afeldman-nm Sep 5, 2025
0a12b97
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 5, 2025
183f97d
revise
afeldman-nm Sep 5, 2025
bba582f
burn changes
afeldman-nm Sep 8, 2025
b2a3d02
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 8, 2025
36a4875
Break out different initialization tests
afeldman-nm Sep 8, 2025
e32e371
shard hybrid models
afeldman-nm Sep 8, 2025
1d883af
Standard language models test
afeldman-nm Sep 8, 2025
774b9e4
small fix
afeldman-nm Sep 9, 2025
4941627
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 9, 2025
1fd0c68
wip
afeldman-nm Sep 9, 2025
0093c1f
typo
afeldman-nm Sep 9, 2025
62e8055
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 9, 2025
0892a96
test
afeldman-nm Sep 10, 2025
a5ecbcf
test
afeldman-nm Sep 10, 2025
fa64ce0
test
afeldman-nm Sep 10, 2025
1ac85fd
test
afeldman-nm Sep 10, 2025
4f4f60c
wip
afeldman-nm Sep 10, 2025
5fc5910
wip
afeldman-nm Sep 10, 2025
39df6a1
wip
afeldman-nm Sep 10, 2025
0b9c6e7
wip
afeldman-nm Sep 10, 2025
8f58547
different approach to sharding
afeldman-nm Sep 10, 2025
d3713c0
merge
afeldman-nm Sep 10, 2025
e70c8cd
percent N
afeldman-nm Sep 10, 2025
682d675
fix
afeldman-nm Sep 10, 2025
ee2b0dc
more sharding
afeldman-nm Sep 10, 2025
4a26031
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 10, 2025
c288ab3
isolate test; lower parallelism
afeldman-nm Sep 11, 2025
6ee7661
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 11, 2025
ff660fc
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 11, 2025
e66b7e6
lower shard factor and reorganize
afeldman-nm Sep 11, 2025
28239d4
explanatory comments
afeldman-nm Sep 11, 2025
009458f
wip
afeldman-nm Sep 11, 2025
63015ab
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 11, 2025
aef7215
test names
afeldman-nm Sep 11, 2025
09e84a4
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 12, 2025
bb15907
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 12, 2025
a806634
Merge branch 'main' into speed_model_ci
afeldman-nm Sep 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 66 additions & 12 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -571,44 +571,98 @@ steps:

##### models test #####

- label: Basic Models Test # 57min
timeout_in_minutes: 75
- label: Basic Models Tests (Initialization)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models
- tests/models/test_initialization.py
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py
- pytest -v -s models/test_vision.py
- pytest -v -s models/test_initialization.py
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset

- label: Language Models Test (Standard) # 35min
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/test_initialization.py
commands:
# Only run when vLLM model source is modified: test initialization of a large
# subset of supported models (the complement of the small subset in the test
# above). Also run if the model initialization test file is modified.
- pytest -v -s models/test_initialization.py \
-k 'not test_can_initialize_small_subset' \
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2

- label: Basic Models Tests (Other)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_transformers.py
- tests/models/test_registry.py
- tests/models/test_utils.py
- tests/models/test_vision.py
commands:
- pytest -v -s models/test_transformers.py \
models/test_registry.py \
models/test_utils.py \
models/test_vision.py

- label: Language Models Tests (Standard)
timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
commands:
# Test standard language models, excluding a subset of slow tests
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model
- pytest -v -s models/language -m 'core_model and (not slow_test)'

- label: Language Models Test (Hybrid) # 35 min
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/language/pooling/test_embedding.py
- tests/models/language/generation/test_common.py
- tests/models/language/pooling/test_classification.py
commands:
# Shard slow subset of standard language models tests. Only run when model
# source is modified, or when specified test files are modified
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and slow_test' \
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2

- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/[email protected]'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/[email protected]'
- pytest -v -s models/language/generation -m hybrid_model
# Shard hybrid language model tests
- pytest -v -s models/language/generation \
-m hybrid_model \
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2

- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ skip_gitignore = true

[tool.pytest.ini_options]
markers = [
"slow_test",
"skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly",
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
Expand Down
14 changes: 10 additions & 4 deletions tests/models/language/generation/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
[
pytest.param(
"bigscience/bloom-560m", # bloom - testing alibi slopes
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.slow_test],
),
pytest.param(
"openai-community/gpt2", # gpt2
Expand All @@ -49,7 +49,10 @@
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
pytest.param(
"google/gemma-1.1-2b-it", # gemma
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
],
),
pytest.param(
"zai-org/chatglm3-6b", # chatglm (text-only)
Expand All @@ -70,14 +73,17 @@
),
pytest.param(
"microsoft/phi-2", # phi
marks=[pytest.mark.core_model],
marks=[pytest.mark.core_model, pytest.mark.slow_test],
),
pytest.param(
"Qwen/Qwen-7B-Chat", # qwen (text-only)
),
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
],
),
pytest.param(
"Qwen/Qwen3-8B", # qwen (text-only)
Expand Down
5 changes: 4 additions & 1 deletion tests/models/language/pooling/test_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
"model",
[
pytest.param("jason9693/Qwen2.5-1.5B-apeach",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
]),
],
)
@pytest.mark.parametrize("dtype",
Expand Down
7 changes: 5 additions & 2 deletions tests/models/language/pooling/test_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# model code with bidirectional attention.
# [Decoder-only]
pytest.param("BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model]),
marks=[pytest.mark.core_model, pytest.mark.slow_test]),
pytest.param(
"intfloat/e5-mistral-7b-instruct",
# CPU v1 doesn't support sliding window
Expand All @@ -29,7 +29,10 @@
# [Encoder-only]
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.slow_test
],
),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
Expand Down
39 changes: 37 additions & 2 deletions tests/models/test_initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,26 @@
HF_EXAMPLE_MODELS, HfExampleModels)
from .utils import dummy_hf_overrides

# Small, representative subset of the supported model architectures. In the
# "typical" regression testing scenario only these models are initialized,
# rather than the full supported list. The entries were picked to span the
# main model varieties/workloads: conditional generation, sequence
# classification, causal LM, ranking, chat, reward model, multimodal,
# geospatial, voice, embedding, and MTP.
MINIMAL_MODEL_ARCH_LIST = [
    "LlavaForConditionalGeneration",
    "Llama4ForConditionalGeneration",
    "BertForSequenceClassification",
    "Gemma3nForCausalLM",
    "JinaVLForRanking",
    "InternVLChatModel",
    "InternLM2ForRewardModel",
    "TransformersForMultimodalLM",
    "PrithviGeoSpatialMAE",
    "UltravoxModel",
    "DeepSeekMTPModel",
    "XLMRobertaModel",
]

# This list is the complement of the minimal list above: every supported
# architecture that is not in MINIMAL_MODEL_ARCH_LIST. The intention is that
# these models are only tested in a "special case", i.e. most PRs should not
# test these models.
#
# The set difference is sorted into a real list so that the parametrization
# order is the same in every interpreter process. Iteration order of a set of
# strings can change from run to run under hash randomization, which makes
# test collection order non-reproducible across parallel/sharded jobs.
OTHER_MODEL_ARCH_LIST = sorted(
    set(HF_EXAMPLE_MODELS.get_supported_archs()) -
    set(MINIMAL_MODEL_ARCH_LIST))


@create_new_process_for_each_test()
def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
Expand Down Expand Up @@ -101,8 +121,23 @@ def _initialize_kv_caches_v1(self, vllm_config):
max_num_seqs=model_info.max_num_seqs)


@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST)
def test_can_initialize_small_subset(model_arch: str,
                                     monkeypatch: pytest.MonkeyPatch):
    """Initialize every architecture in the minimal subset.

    Parametrized over MINIMAL_MODEL_ARCH_LIST; each architecture is handed to
    ``can_initialize`` together with the HF example-model registry.
    """
    # Architectures this test cannot exercise yet (see the skip message).
    unsupported_archs = ("Lfm2ForCausalLM", )
    if model_arch in unsupported_archs:
        pytest.skip("Skipping until test supports V1-only models")

    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)


@pytest.mark.parametrize("model_arch", OTHER_MODEL_ARCH_LIST)
def test_can_initialize_large_subset(model_arch: str,
                                     monkeypatch: pytest.MonkeyPatch):
    """Initialize the supported architectures outside the small subset.

    Parametrized over OTHER_MODEL_ARCH_LIST, i.e. the complement of the
    architectures exercised by the "small subset" test.
    """
    # The skip message indicates this architecture is V1-only, which the test
    # does not support yet.
    is_v1_only_arch = model_arch == "Lfm2ForCausalLM"
    if is_v1_only_arch:
        pytest.skip("Skipping until test supports V1-only models")

    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
Expand Down