Skip to content

Commit 6665204

Browse files
committed
Demonstrate attention routing in Qwen 3
1 parent c8f94c9 commit 6665204

File tree

2 files changed

+17 -3 lines changed

2 files changed

+17 -3 lines changed

Libraries/MLXLLM/Models/Qwen3.swift

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,23 @@ private class Attention: Module {
7676
keys = kNorm(keys.reshaped(B, L, args.kvHeads, -1)).transposed(0, 2, 1, 3)
7777
values = values.reshaped(B, L, args.kvHeads, -1).transposed(0, 2, 1, 3)
7878

79+
// Apply RoPE positioning
7980
if let cache {
8081
queries = rope(queries, offset: cache.offset)
8182
keys = rope(keys, offset: cache.offset)
82-
(keys, values) = cache.update(keys: keys, values: values)
8383
} else {
8484
queries = rope(queries)
8585
keys = rope(keys)
8686
}
8787

88-
let output = MLXFast.scaledDotProductAttention(
89-
queries: queries, keys: keys, values: values, scale: scale, mask: mask
88+
// Use the automatic attention router that handles both quantized and regular caches
89+
let output = attentionWithCacheUpdate(
90+
queries: queries,
91+
keys: keys,
92+
values: values,
93+
cache: cache,
94+
scale: scale,
95+
mask: mask
9096
)
9197
.transposed(0, 2, 1, 3)
9298
.reshaped(B, L, -1)

mlx-swift-examples.xcodeproj/xcshareddata/xcschemes/llm-tool.xcscheme

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,16 @@
6161
</CommandLineArgument>
6262
<CommandLineArgument
6363
argument = "--model mlx-community/Qwen2-VL-2B-Instruct-4bit --prompt &apos;Describe the image in English.&apos; --image https://www.gstatic.com/webp/gallery/1.webp"
64+
isEnabled = "NO">
65+
</CommandLineArgument>
66+
<CommandLineArgument
67+
argument = "--model mlx-community/Qwen3-1.7B-4bit --prompt &quot;Explain quantum computing in simple terms&quot; --max-tokens 100 --kv-bits 4"
6468
isEnabled = "YES">
6569
</CommandLineArgument>
70+
<CommandLineArgument
71+
argument = "--model mlx-community/Qwen3-1.7B-4bit --prompt &quot;Explain quantum computing in simple terms&quot; --max-tokens 100"
72+
isEnabled = "NO">
73+
</CommandLineArgument>
6674
<CommandLineArgument
6775
argument = "--repetition-penalty 1.2"
6876
isEnabled = "NO">

0 commit comments

Comments (0)