---
# FLAN-T5-base architecture
# 12 encoder layers, 12 decoder layers, 768 hidden dim
d_model: 768
# Align vocab with FLAN-T5 padded size to avoid weight truncation
vocab_size: 32128
num_encoder_layers: 12  # T5-base has 12 layers
num_decoder_layers: 12  # T5-base has 12 layers
num_attention_heads: 12
ffn_dim: 2048  # T5 uses d_ff = 2048 for base model
dropout: 0.1  # Standard dropout
activation: gated-gelu  # T5/FLAN-T5 uses gated-gelu (GELU activation with gating, not SwiGLU)
use_pretrained: true
pretrained_model_name: google/flan-t5-base
use_relative_position_bias: true  # T5 uses relative position bias instead of absolute embeddings
gradient_checkpointing: false