---
# FLAN-T5-base architecture
# 12 encoder layers, 12 decoder layers, 768 hidden dim
d_model: 768
# Align vocab with FLAN-T5 padded size to avoid weight truncation
vocab_size: 32128
num_encoder_layers: 12  # T5-base has 12 layers
num_decoder_layers: 12  # T5-base has 12 layers
num_attention_heads: 12
ffn_dim: 2048  # T5 uses d_ff = 2048 for base model
dropout: 0.1  # Standard dropout
activation: gated-gelu  # T5/FLAN-T5 uses gated-gelu (GELU activation with gating, not SwiGLU)
use_pretrained: true
pretrained_model_name: google/flan-t5-base
use_relative_position_bias: true  # T5 uses relative position bias instead of absolute embeddings
gradient_checkpointing: false