Commit 6b4c072
Parent: d9092be

Build: Pin ruff to v0.4.4 in pre-commit to match CI

Files changed:
- .pre-commit-config.yaml +1 -1
- src/models/encoder.py +3 -3
- src/models/heads.py +6 -6
- tests/test_models/test_decoder.py +3 -3
.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.
+    rev: v0.4.4
     hooks:
       - id: ruff
         args: [ --fix ]
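Note: the bump only helps if the hook and CI agree on the same ruff release. Below is a minimal consistency-check sketch; the requirements-dev.txt path and its "ruff==" pin are hypothetical illustrations, not taken from this repo.

# Hypothetical check that the pre-commit ruff pin matches the CI pin.
# The requirements-dev.txt location and its "ruff==" line are assumptions.
import re

import yaml  # PyYAML


def precommit_ruff_rev(path=".pre-commit-config.yaml"):
    with open(path) as f:
        config = yaml.safe_load(f)
    for repo in config["repos"]:
        if "ruff-pre-commit" in repo["repo"]:
            return repo["rev"].lstrip("v")  # mirror tags look like "v0.4.4"
    raise LookupError("ruff-pre-commit hook not found")


def ci_ruff_version(path="requirements-dev.txt"):
    with open(path) as f:
        for line in f:
            match = re.match(r"ruff==([\d.]+)", line.strip())
            if match:
                return match.group(1)
    raise LookupError("ruff pin not found in CI requirements")


if __name__ == "__main__":
    assert precommit_ruff_rev() == ci_ruff_version(), "ruff versions out of sync"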
src/models/encoder.py
@@ -160,9 +160,9 @@ class TransformerEncoder(nn.Module):
         Build a 3D attention mask (batch, seq, seq) from input_ids and pad_token_id.
         True indicates valid positions; False indicates masked (pad).
         """
-        assert
-
-        )
+        assert (
+            self.pad_token_id is not None
+        ), "pad_token_id must be set to build padding mask from ids."
         # mask shape: (batch, seq) where True = token kept (non-pad)
         pad_mask = input_ids != self.pad_token_id
         # Convert to (batch, seq_q, seq_k) by outer product broadcasting
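The comments in this hunk describe turning the 2D pad mask into a 3D attention mask by outer-product broadcasting. A minimal sketch of that step with a toy pad_token_id and batch (illustrative only, not the repo's actual code):

import torch

pad_token_id = 0                              # assumed value for illustration
input_ids = torch.tensor([[5, 7, 9, 0, 0]])   # (batch=1, seq=5), two trailing pads

pad_mask = input_ids != pad_token_id          # (batch, seq): True = kept token
attn_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze(1)  # (batch, seq_q, seq_k)

print(attn_mask.shape)   # torch.Size([1, 5, 5])
print(attn_mask[0, 0])   # tensor([ True,  True,  True, False, False])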
src/models/heads.py
@@ -97,12 +97,12 @@ class LMHead(nn.Module):
 
         if tie_embedding is not None:
             # Validate sizes
-            assert
-
-            )
-            assert
-
-            )
+            assert (
+                tie_embedding.num_embeddings == vocab_size
+            ), "vocab size mismatch for weight tying"
+            assert (
+                tie_embedding.embedding_dim == d_model
+            ), "embedding dim must match d_model for weight tying"
             # Tie weights: point the projection weight to the embedding weight Tensor
             # Remove the existing projection parameter in favor of the embedding weight
             # This keeps the same Parameter object, so updates affect both modules.
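The surrounding comments describe weight tying by reusing the embedding's Parameter as the projection weight. A standalone sketch of that pattern with toy sizes (illustrative only; the module names here are not the repo's):

import torch.nn as nn

vocab_size, d_model = 100, 16                 # toy sizes for illustration
embedding = nn.Embedding(vocab_size, d_model)
projection = nn.Linear(d_model, vocab_size, bias=False)

# nn.Linear.weight is (out_features, in_features) = (vocab_size, d_model),
# the same shape as nn.Embedding.weight, so the tie is shape-compatible.
assert embedding.weight.shape == projection.weight.shape

# Reuse the embedding's Parameter object: one tensor shared by both modules,
# so an optimizer step on either updates both, as the diff's comments say.
projection.weight = embedding.weight
assert projection.weight is embedding.weight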
tests/test_models/test_decoder.py
@@ -64,9 +64,9 @@ def test_decoder_layer_causal_mask_blocks_future():
     B, H, Tq, Tk = self_attn.shape
     for i in range(Tq):
         for j in range(i + 1, Tk):
-            assert torch.allclose(
-
-            )
+            assert torch.allclose(
+                self_attn[:, :, i, j], torch.zeros(B, H)
+            ), f"Found nonzero attention to future position {j} from query {i}"
 
 
 def test_decoder_stack_and_greedy_decode_shapes():
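The strengthened assert checks, entry by entry, that no query attends to a future key. A toy illustration of why those entries come out exactly zero when a causal mask is applied before the softmax (an assumed masking scheme, not necessarily this decoder's exact implementation):

import torch

B, H, T = 1, 2, 4
scores = torch.randn(B, H, T, T)                          # raw attention logits
causal = torch.tril(torch.ones(T, T, dtype=torch.bool))   # True at/below diagonal
weights = torch.softmax(scores.masked_fill(~causal, float("-inf")), dim=-1)

# exp(-inf) == 0, so every weight at a future (j > i) position is exactly zero,
# which is the property the test's elementwise allclose verifies.
assert torch.all(weights.masked_select(~causal) == 0)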