OliverPerrin committed
Commit 590a604 · 1 Parent(s): 7977c7d

Full training run, code cleanup, mypy/ruff fixes


- Completed 3-epoch full training run (21 hours)
- Emotion F1: 94.6%, Topic Accuracy: 94.2%, ROUGE-like: 0.36
- Code simplification and standardized docstrings across all modules
- Fixed all mypy type errors (47 files pass)
- Fixed all ruff linting errors and reformatted code
- All 75 tests passing
- Added GPU optimizations: TF32, Flash Attention, memory-efficient SDP (see the sketch below)
- Training configs optimized for RTX 4070 12GB
- tqdm progress bars for training and evaluation
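
A minimal sketch of the GPU settings this commit enables in scripts/train.py (assuming CUDA and an Ampere-or-newer card, e.g. an RTX 40xx; this mirrors the train.py diff below rather than adding anything new):

    import torch

    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
        torch.set_float32_matmul_precision("high")           # TF32 matmuls
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        torch.backends.cudnn.benchmark = True                # auto-tune convolution algorithms
        torch.backends.cuda.enable_flash_sdp(True)           # Flash Attention SDP kernel
        torch.backends.cuda.enable_mem_efficient_sdp(True)   # memory-efficient SDP kernel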

configs/config.yaml CHANGED
@@ -4,6 +4,13 @@ defaults:
4
  - training: default
5
  - _self_
6
7
  checkpoint_out: "checkpoints/best.pt"
8
  labels_out: "artifacts/labels.json"
9
  history_out: "outputs/training_history.json"
 
4
  - training: default
5
  - _self_
6
 
7
+ # Hydra config - prevent output dir conflicts
8
+ hydra:
9
+ run:
10
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
11
+ sweep:
12
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
13
+
14
  checkpoint_out: "checkpoints/best.pt"
15
  labels_out: "artifacts/labels.json"
16
  history_out: "outputs/training_history.json"
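
For reference, Hydra resolves the ${now:...} patterns when a run starts, so a job launched at, say, 14:30:00 on 2025-12-07 (hypothetical timestamp) writes to outputs/2025-12-07/14-30-00; each run gets its own directory, which is what prevents the output-dir conflicts the comment refers to.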
configs/training/dev.yaml CHANGED
@@ -1,35 +1,32 @@
1
  # Development/Testing Configuration for FLAN-T5-base
2
  # Fast iteration for debugging and testing changes
3
- # Training time: ~10 minutes on RTX 4070 with aot_eager backend
4
  # Use: python scripts/train.py training=dev
5
 
6
  dataloader:
7
- batch_size: 8
8
  shuffle: true
9
- num_workers: 4 # Reduced to avoid overhead
10
  pin_memory: true
11
 
12
  optimizer:
13
  name: adamw
14
- lr: 5.0e-5 # Higher LR for faster convergence on small dataset
15
  weight_decay: 0.01
16
 
17
  scheduler:
18
  name: cosine
19
- warmup_steps: 50 # Fewer warmup steps for short training
20
 
21
  trainer:
22
- max_epochs: 1 # Single epoch for quick testing
23
  gradient_clip_norm: 1.0
24
- gradient_accumulation_steps: 1 # No accumulation for speed
25
- validation_max_length: 64 # Shorter for faster validation
26
  label_smoothing: 0.1
27
  task_weights:
28
  summarization: 1.0
29
  emotion: 1.0
30
  topic: 1.0
31
-
32
- # Development-specific settings - optimized for ~10 min total
33
- max_train_samples: 2000 # Reduced for faster iteration
34
- max_val_samples: 200
35
- validation_frequency: 1000 # Validate once during training
 
1
  # Development/Testing Configuration for FLAN-T5-base
2
  # Fast iteration for debugging and testing changes
3
+ # Training time: ~3-5 minutes on RTX 4070 12GB
4
  # Use: python scripts/train.py training=dev
5
 
6
  dataloader:
7
+ batch_size: 8 # Safe for 12GB VRAM - no shared memory spillover
8
  shuffle: true
9
+ num_workers: 4
10
  pin_memory: true
11
 
12
  optimizer:
13
  name: adamw
14
+ lr: 5.0e-5 # Higher LR for fast convergence
15
  weight_decay: 0.01
16
 
17
  scheduler:
18
  name: cosine
19
+ warmup_steps: 50
20
 
21
  trainer:
22
+ max_epochs: 1
23
  gradient_clip_norm: 1.0
24
+ gradient_accumulation_steps: 1 # No accumulation - maximize throughput
25
+ validation_max_length: 64
26
  label_smoothing: 0.1
27
  task_weights:
28
  summarization: 1.0
29
  emotion: 1.0
30
  topic: 1.0
31
+ max_train_samples: 2000
32
+ max_val_samples: 200
 
 
 
configs/training/full.yaml CHANGED
@@ -1,12 +1,12 @@
1
  # Full Training Configuration for FLAN-T5-base
2
  # Complete training run on all data
3
- # Training time: ~6-8 hours on RTX 4070
4
  # Use: python scripts/train.py training=full
5
 
6
  dataloader:
7
- batch_size: 11 # Reduced for FLAN-T5-base (12 layers)
8
  shuffle: true
9
- num_workers: 8
10
  pin_memory: true
11
 
12
  optimizer:
@@ -16,12 +16,12 @@ optimizer:
16
 
17
  scheduler:
18
  name: cosine
19
- warmup_steps: 1000 # More warmup for full training
20
 
21
  trainer:
22
- max_epochs: 4
23
- gradient_clip_norm: 0.5
24
- gradient_accumulation_steps: 6 # Effective batch size = 8 * 6 = 48
25
  validation_max_length: 128
26
  label_smoothing: 0.1
27
  task_weights:
 
1
  # Full Training Configuration for FLAN-T5-base
2
  # Complete training run on all data
3
+ # Training time: ~6-8 hours on RTX 4070 12GB
4
  # Use: python scripts/train.py training=full
5
 
6
  dataloader:
7
+ batch_size: 6 # Optimized for 12GB VRAM
8
  shuffle: true
9
+ num_workers: 6
10
  pin_memory: true
11
 
12
  optimizer:
 
16
 
17
  scheduler:
18
  name: cosine
19
+ warmup_steps: 500 # ~3% of steps
20
 
21
  trainer:
22
+ max_epochs: 3 # 3 epochs usually sufficient, avoids overfit
23
+ gradient_clip_norm: 1.0
24
+ gradient_accumulation_steps: 6 # Effective batch = 36
25
  validation_max_length: 128
26
  label_smoothing: 0.1
27
  task_weights:
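
The effective-batch comments are simply batch_size × gradient_accumulation_steps: 6 × 6 = 36 samples contribute to each optimizer update here, and 6 × 3 = 18 in the medium config below.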
configs/training/medium.yaml CHANGED
@@ -1,36 +1,32 @@
1
  # Medium Configuration for FLAN-T5-base
2
  # Balanced approach - good results in reasonable time
3
- # Training time: ~2-3 hours on RTX 4070
4
  # Use: python scripts/train.py training=medium
5
- # Note: FLAN-T5-base has 12 layers (vs BART's 6), may need smaller batch
6
 
7
  dataloader:
8
- batch_size: 11 # Reduced for FLAN-T5-base (12 layers uses more VRAM)
9
  shuffle: true
10
- num_workers: 8
11
  pin_memory: true
12
 
13
  optimizer:
14
  name: adamw
15
- lr: 2.0e-5 # Slightly lower for larger model
16
  weight_decay: 0.01
17
 
18
  scheduler:
19
  name: cosine
20
- warmup_steps: 500 # More warmup for larger model
21
 
22
  trainer:
23
  max_epochs: 3
24
- gradient_clip_norm: 0.5
25
- gradient_accumulation_steps: 4 # Effective batch size = 8 * 4 = 32
26
- validation_max_length: 128
27
  label_smoothing: 0.1
28
  task_weights:
29
  summarization: 1.0
30
  emotion: 1.0
31
  topic: 1.0
32
-
33
- # Medium dataset - good representative sample
34
  max_train_samples: 50000
35
- max_val_samples: 5000
36
- validation_frequency: 5000
 
1
  # Medium Configuration for FLAN-T5-base
2
  # Balanced approach - good results in reasonable time
3
+ # Training time: ~2-3 hours on RTX 4070 12GB
4
  # Use: python scripts/train.py training=medium
 
5
 
6
  dataloader:
7
+ batch_size: 6 # Optimized for 12GB VRAM with accumulation
8
  shuffle: true
9
+ num_workers: 6
10
  pin_memory: true
11
 
12
  optimizer:
13
  name: adamw
14
+ lr: 3.0e-5 # Slightly higher - compensates for effective batch
15
  weight_decay: 0.01
16
 
17
  scheduler:
18
  name: cosine
19
+ warmup_steps: 300 # ~5% of steps
20
 
21
  trainer:
22
  max_epochs: 3
23
+ gradient_clip_norm: 1.0
24
+ gradient_accumulation_steps: 3 # Effective batch = 18
25
+ validation_max_length: 96
26
  label_smoothing: 0.1
27
  task_weights:
28
  summarization: 1.0
29
  emotion: 1.0
30
  topic: 1.0
 
 
31
  max_train_samples: 50000
32
+ max_val_samples: 5000
 
outputs/evaluation_report.json CHANGED
@@ -1,44 +1,44 @@
1
  {
2
- "split": "test",
3
  "summarization": {
4
- "rouge_like": 0.3430426484440944,
5
- "bleu": 0.0879515124653127
6
  },
7
  "emotion": {
8
- "f1_macro": 0.3558666706085205
9
  },
10
  "topic": {
11
- "accuracy": 0.8576315789473684,
12
  "classification_report": {
13
  "Business": {
14
- "precision": 0.7614165890027959,
15
- "recall": 0.86,
16
- "f1-score": 0.8077113198220465,
17
- "support": 1900
18
  },
19
  "Sci/Tech": {
20
- "precision": 0.8759791122715405,
21
- "recall": 0.7063157894736842,
22
- "f1-score": 0.782051282051282,
23
- "support": 1900
24
  },
25
  "Sports": {
26
- "precision": 0.9454638124362895,
27
- "recall": 0.9763157894736842,
28
- "f1-score": 0.9606421543241843,
29
- "support": 1900
30
  },
31
  "World": {
32
- "precision": 0.8607142857142858,
33
- "recall": 0.8878947368421053,
34
- "f1-score": 0.8740932642487047,
35
- "support": 1900
36
  },
37
  "macro avg": {
38
- "precision": 0.860893449856228,
39
- "recall": 0.8576315789473684,
40
- "f1-score": 0.8561245051115545,
41
- "support": 7600
42
  }
43
  }
44
  }
 
1
  {
2
+ "split": "val",
3
  "summarization": {
4
+ "rouge_like": 0.35947467920968945,
5
+ "bleu": 0.09027012433010549
6
  },
7
  "emotion": {
8
+ "f1_macro": 0.9455000162124634
9
  },
10
  "topic": {
11
+ "accuracy": 0.94175,
12
  "classification_report": {
13
  "Business": {
14
+ "precision": 0.9319045973038369,
15
+ "recall": 0.8986666666666666,
16
+ "f1-score": 0.9149838791786866,
17
+ "support": 3000
18
  },
19
  "Sci/Tech": {
20
+ "precision": 0.9055627425614489,
21
+ "recall": 0.9333333333333333,
22
+ "f1-score": 0.9192383453709784,
23
+ "support": 3000
24
  },
25
  "Sports": {
26
+ "precision": 0.9856475300400535,
27
+ "recall": 0.9843333333333333,
28
+ "f1-score": 0.9849899933288859,
29
+ "support": 3000
30
  },
31
  "World": {
32
+ "precision": 0.9446836700894335,
33
+ "recall": 0.9506666666666667,
34
+ "f1-score": 0.9476657252035222,
35
+ "support": 3000
36
  },
37
  "macro avg": {
38
+ "precision": 0.9419496349986932,
39
+ "recall": 0.94175,
40
+ "f1-score": 0.9417194857705183,
41
+ "support": 12000
42
  }
43
  }
44
  }
outputs/training_history.json CHANGED
@@ -1,21 +1,59 @@
1
  {
2
  "train_epoch_1": {
3
- "summarization_loss": 3.67411927986145,
4
- "summarization_rouge_like": 0.39456057390021504,
5
- "emotion_loss": 0.5643834336996079,
6
- "emotion_f1": 0.023809524163603782,
7
- "topic_loss": 1.2467568359375,
8
- "topic_accuracy": 0.587,
9
- "total_loss": 5.485259549498558,
10
  "epoch": 1.0
11
  },
12
  "val_epoch_1": {
13
- "summarization_loss": 3.2498003482818603,
14
- "summarization_rouge_like": 0.44230111155579444,
15
- "emotion_loss": 0.4288424849510193,
16
- "emotion_f1": 0.0,
17
- "topic_loss": 0.807373046875,
18
- "topic_accuracy": 0.85,
19
  "epoch": 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  }
21
  }
 
1
  {
2
  "train_epoch_1": {
3
+ "summarization_loss": 3.222269726091524,
4
+ "summarization_rouge_like": 0.4348834303103812,
5
+ "emotion_loss": 0.2681197640352259,
6
+ "emotion_f1": 0.4939010590246358,
7
+ "topic_loss": 0.2817161389551497,
8
+ "topic_accuracy": 0.9126178087058748,
9
+ "total_loss": 3.7721057520380095,
10
  "epoch": 1.0
11
  },
12
  "val_epoch_1": {
13
+ "summarization_loss": 2.9376416314440097,
14
+ "summarization_rouge_like": 0.4621969238397049,
15
+ "emotion_loss": 0.07456208207925424,
16
+ "emotion_f1": 0.922451647864638,
17
+ "topic_loss": 0.18789680490184146,
18
+ "topic_accuracy": 0.9368641532016696,
19
  "epoch": 1.0
20
+ },
21
+ "train_epoch_2": {
22
+ "summarization_loss": 3.0815064049717713,
23
+ "summarization_rouge_like": 0.44604443152864864,
24
+ "emotion_loss": 0.04770229796717623,
25
+ "emotion_f1": 0.9407868445694336,
26
+ "topic_loss": 0.1507136240392336,
27
+ "topic_accuracy": 0.9498742677227413,
28
+ "total_loss": 3.279922429068798,
29
+ "epoch": 2.0
30
+ },
31
+ "val_epoch_2": {
32
+ "summarization_loss": 2.8898715693603942,
33
+ "summarization_rouge_like": 0.4654528613816311,
34
+ "emotion_loss": 0.05001389549380918,
35
+ "emotion_f1": 0.9344953305524384,
36
+ "topic_loss": 0.1755385091801308,
37
+ "topic_accuracy": 0.9435966487133395,
38
+ "epoch": 2.0
39
+ },
40
+ "train_epoch_3": {
41
+ "summarization_loss": 3.0340622767404044,
42
+ "summarization_rouge_like": 0.4502876682264882,
43
+ "emotion_loss": 0.025708710505635942,
44
+ "emotion_f1": 0.9647584015837614,
45
+ "topic_loss": 0.11707986947991166,
46
+ "topic_accuracy": 0.9614479064357344,
47
+ "total_loss": 3.176850952874497,
48
+ "epoch": 3.0
49
+ },
50
+ "val_epoch_3": {
51
+ "summarization_loss": 2.865455434181104,
52
+ "summarization_rouge_like": 0.46790124713702563,
53
+ "emotion_loss": 0.05574661032417156,
54
+ "emotion_f1": 0.940105742034193,
55
+ "topic_loss": 0.19245651335709887,
56
+ "topic_accuracy": 0.942998204667858,
57
+ "epoch": 3.0
58
  }
59
  }
scripts/download_data.py CHANGED
@@ -1,4 +1,12 @@
1
- """Download datasets used by LexiMind."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Dataset download script for LexiMind.
3
+
4
+ Downloads training datasets from various sources including HuggingFace Hub,
5
+ Kaggle, and Project Gutenberg. Handles automatic conversion to JSONL format.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
scripts/eval_rouge.py CHANGED
@@ -1,4 +1,12 @@
1
- """Utility script to evaluate LexiMind summaries with ROUGE."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ ROUGE evaluation script for LexiMind.
3
+
4
+ Computes ROUGE-1, ROUGE-2, and ROUGE-L scores on summarization outputs
5
+ with support for batched inference and customizable metrics.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
scripts/evaluate.py CHANGED
@@ -1,6 +1,11 @@
1
  """
2
- Evaluate the multitask model on processed validation/test splits.
3
- This is used for getting definitive scores on my test set after training is complete.
 
 
 
 
 
4
  """
5
 
6
  from __future__ import annotations
@@ -8,9 +13,12 @@ from __future__ import annotations
8
  import argparse
9
  import json
10
  import sys
 
11
  from pathlib import Path
12
- from typing import Any, List, cast
13
 
 
 
14
  import torch
15
  from sklearn.preprocessing import MultiLabelBinarizer
16
  from tqdm import tqdm
@@ -19,14 +27,7 @@ PROJECT_ROOT = Path(__file__).resolve().parents[1]
19
  if str(PROJECT_ROOT) not in sys.path:
20
  sys.path.insert(0, str(PROJECT_ROOT))
21
 
22
- import matplotlib.pyplot as plt
23
- import seaborn as sns
24
-
25
- from src.data.dataset import (
26
- load_emotion_jsonl,
27
- load_summarization_jsonl,
28
- load_topic_jsonl,
29
- )
30
  from src.inference.factory import create_inference_pipeline
31
  from src.training.metrics import (
32
  accuracy,
@@ -38,80 +39,67 @@ from src.training.metrics import (
38
  )
39
  from src.utils.config import load_yaml
40
 
41
- SPLIT_ALIASES = {
42
- "train": ("train",),
43
- "val": ("val", "validation"),
44
- "test": ("test",),
45
- }
46
 
 
47
 
48
- def _read_split(root: Path, split: str, loader) -> List[Any]:
49
- aliases = SPLIT_ALIASES.get(split, (split,))
50
- for alias in aliases:
 
51
  for ext in ("jsonl", "json"):
52
- candidate = root / f"{alias}.{ext}"
53
- if candidate.exists():
54
- return cast(List[Any], loader(str(candidate)))
55
- raise FileNotFoundError(f"Missing {split} split under {root}")
56
 
57
 
58
- def parse_args() -> argparse.Namespace:
59
- parser = argparse.ArgumentParser(description="Evaluate the LexiMind multitask model")
60
- parser.add_argument(
61
- "--split",
62
- default="val",
63
- choices=["train", "val", "test"],
64
- help="Dataset split to evaluate.",
65
- )
66
- parser.add_argument(
67
- "--checkpoint", default="checkpoints/best.pt", help="Path to the trained checkpoint."
68
- )
69
- parser.add_argument("--labels", default="artifacts/labels.json", help="Label metadata JSON.")
70
- parser.add_argument(
71
- "--data-config", default="configs/data/datasets.yaml", help="Data configuration YAML."
72
- )
73
- parser.add_argument(
74
- "--model-config", default="configs/model/base.yaml", help="Model architecture YAML."
75
- )
76
- parser.add_argument(
77
- "--device",
78
- default="cuda" if torch.cuda.is_available() else "cpu",
79
- help="Device for evaluation.",
80
- )
81
- parser.add_argument(
82
- "--batch-size",
83
- type=int,
84
- default=16,
85
- help="Batch size for generation/classification during evaluation.",
86
- )
87
- parser.add_argument(
88
- "--output-dir", default="outputs", help="Directory to save evaluation artifacts."
89
- )
90
- return parser.parse_args()
91
 
92
 
93
- def chunks(items: List, size: int):
94
- for start in range(0, len(items), size):
95
- yield items[start : start + size]
96
 
97
 
98
- def plot_confusion_matrix(cm, labels, output_path):
 
99
  plt.figure(figsize=(10, 8))
100
  sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
101
  plt.xlabel("Predicted")
102
  plt.ylabel("True")
103
  plt.title("Topic Classification Confusion Matrix")
104
  plt.tight_layout()
105
- plt.savefig(output_path)
106
  plt.close()
107
 
108
109
  def main() -> None:
110
  args = parse_args()
111
- data_cfg = load_yaml(args.data_config).data
 
112
  output_dir = Path(args.output_dir)
113
  output_dir.mkdir(parents=True, exist_ok=True)
114
 
 
 
115
  pipeline, metadata = create_inference_pipeline(
116
  checkpoint_path=args.checkpoint,
117
  labels_path=args.labels,
@@ -120,100 +108,94 @@ def main() -> None:
120
  device=args.device,
121
  )
122
 
123
- summarization_dir = Path(data_cfg["processed"]["summarization"])
124
- emotion_dir = Path(data_cfg["processed"]["emotion"])
125
- topic_dir = Path(data_cfg["processed"]["topic"])
126
-
127
- summary_examples = _read_split(summarization_dir, args.split, load_summarization_jsonl)
128
- emotion_examples = _read_split(emotion_dir, args.split, load_emotion_jsonl)
129
- topic_examples = _read_split(topic_dir, args.split, load_topic_jsonl)
130
-
131
- emotion_binarizer = MultiLabelBinarizer(classes=metadata.emotion)
132
- # Ensure scikit-learn initializes the attributes using metadata ordering.
133
- emotion_binarizer.fit([[label] for label in metadata.emotion])
134
-
135
- # Summarization
136
- print("Evaluating Summarization...")
137
- summaries_pred = []
138
- summaries_ref = []
139
- total_batches = (len(summary_examples) + args.batch_size - 1) // args.batch_size
140
- for batch in tqdm(
141
- chunks(summary_examples, args.batch_size),
142
- total=total_batches,
143
- desc="Summarization",
144
- unit="batch",
145
- ):
146
- inputs = [example.source for example in batch]
147
- summaries_pred.extend(pipeline.summarize(inputs))
148
- summaries_ref.extend([example.summary for example in batch])
149
-
150
- rouge_score = rouge_like(summaries_pred, summaries_ref)
151
- bleu_score = calculate_bleu(summaries_pred, summaries_ref)
152
-
153
- # Emotion
154
- print("Evaluating Emotion Classification...")
155
- emotion_preds_tensor = []
156
- emotion_target_tensor = []
157
- label_to_index = {label: idx for idx, label in enumerate(metadata.emotion)}
158
- total_batches = (len(emotion_examples) + args.batch_size - 1) // args.batch_size
159
-
160
- # Lower threshold to 0.3 to catch weak signals, or use argmax if appropriate
161
- # For now, we'll stick to thresholding but lower it.
162
- inference_threshold = 0.3
163
-
164
- for batch in tqdm(
165
- chunks(emotion_examples, args.batch_size), total=total_batches, desc="Emotion", unit="batch"
166
- ):
167
- inputs = [example.text for example in batch]
168
- predictions = pipeline.predict_emotions(inputs, threshold=inference_threshold)
169
- target_matrix = emotion_binarizer.transform([list(example.emotions) for example in batch])
170
- for pred, target_row in zip(predictions, target_matrix, strict=False):
171
- vector = torch.zeros(len(metadata.emotion), dtype=torch.float32)
172
- for label in pred.labels:
173
- idx = label_to_index.get(label)
174
- if idx is not None:
175
- vector[idx] = 1.0
176
- emotion_preds_tensor.append(vector)
177
- emotion_target_tensor.append(torch.tensor(target_row, dtype=torch.float32))
178
-
179
- emotion_f1 = multilabel_f1(
180
- torch.stack(emotion_preds_tensor), torch.stack(emotion_target_tensor)
 
 
 
181
  )
 
 
182
 
183
- # Topic
184
- print("Evaluating Topic Classification...")
185
- topic_preds = []
186
- topic_targets = []
187
- total_batches = (len(topic_examples) + args.batch_size - 1) // args.batch_size
188
- for batch in tqdm(
189
- chunks(topic_examples, args.batch_size), total=total_batches, desc="Topic", unit="batch"
190
- ):
191
- inputs = [example.text for example in batch]
192
- topic_predictions = pipeline.predict_topics(inputs)
193
- topic_preds.extend([pred.label for pred in topic_predictions])
194
- topic_targets.extend([example.topic for example in batch])
195
-
196
- topic_accuracy = accuracy(topic_preds, topic_targets)
197
- topic_report = classification_report_dict(topic_preds, topic_targets, labels=metadata.topic)
198
- topic_cm = get_confusion_matrix(topic_preds, topic_targets, labels=metadata.topic)
199
-
200
- # Save Confusion Matrix
201
  cm_path = output_dir / "topic_confusion_matrix.png"
202
  plot_confusion_matrix(topic_cm, metadata.topic, cm_path)
203
- print(f"Confusion matrix saved to {cm_path}")
 
 
204
 
205
  results = {
206
  "split": args.split,
207
- "summarization": {"rouge_like": rouge_score, "bleu": bleu_score},
208
  "emotion": {"f1_macro": emotion_f1},
209
- "topic": {"accuracy": topic_accuracy, "classification_report": topic_report},
210
  }
211
 
212
  report_path = output_dir / "evaluation_report.json"
213
- with open(report_path, "w", encoding="utf-8") as f:
214
  json.dump(results, f, indent=2)
215
 
216
- print(f"Evaluation complete. Report saved to {report_path}")
 
 
 
 
217
  print(json.dumps(results, indent=2))
218
 
219
 
 
1
  """
2
+ Evaluation script for LexiMind.
3
+
4
+ Computes ROUGE/BLEU for summarization, multi-label F1 for emotion,
5
+ and accuracy with confusion matrix for topic classification.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
  """
10
 
11
  from __future__ import annotations
 
13
  import argparse
14
  import json
15
  import sys
16
+ import time
17
  from pathlib import Path
18
+ from typing import Any, Callable, List
19
 
20
+ import matplotlib.pyplot as plt
21
+ import seaborn as sns
22
  import torch
23
  from sklearn.preprocessing import MultiLabelBinarizer
24
  from tqdm import tqdm
 
27
  if str(PROJECT_ROOT) not in sys.path:
28
  sys.path.insert(0, str(PROJECT_ROOT))
29
 
30
+ from src.data.dataset import load_emotion_jsonl, load_summarization_jsonl, load_topic_jsonl
31
  from src.inference.factory import create_inference_pipeline
32
  from src.training.metrics import (
33
  accuracy,
 
39
  )
40
  from src.utils.config import load_yaml
41
 
42
+ # --------------- Data Loading ---------------
 
 
 
 
43
 
44
+ SPLIT_ALIASES = {"train": ("train",), "val": ("val", "validation"), "test": ("test",)}
45
 
46
+
47
+ def load_split(root: Path, split: str, loader: Callable[[str], List[Any]]) -> List[Any]:
48
+ """Load a dataset split, checking aliases."""
49
+ for alias in SPLIT_ALIASES.get(split, (split,)):
50
  for ext in ("jsonl", "json"):
51
+ path = root / f"{alias}.{ext}"
52
+ if path.exists():
53
+ return list(loader(str(path)))
54
+ raise FileNotFoundError(f"Missing {split} split in {root}")
55
 
56
 
57
+ def chunks(items: List, size: int):
58
+ """Yield batches of items."""
59
+ for i in range(0, len(items), size):
60
+ yield items[i : i + size]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
 
63
+ # --------------- Visualization ---------------
 
 
64
 
65
 
66
+ def plot_confusion_matrix(cm, labels, path: Path) -> None:
67
+ """Save confusion matrix heatmap."""
68
  plt.figure(figsize=(10, 8))
69
  sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
70
  plt.xlabel("Predicted")
71
  plt.ylabel("True")
72
  plt.title("Topic Classification Confusion Matrix")
73
  plt.tight_layout()
74
+ plt.savefig(path)
75
  plt.close()
76
 
77
 
78
+ # --------------- Main ---------------
79
+
80
+
81
+ def parse_args() -> argparse.Namespace:
82
+ p = argparse.ArgumentParser(description="Evaluate LexiMind")
83
+ p.add_argument("--split", default="val", choices=["train", "val", "test"])
84
+ p.add_argument("--checkpoint", default="checkpoints/best.pt")
85
+ p.add_argument("--labels", default="artifacts/labels.json")
86
+ p.add_argument("--data-config", default="configs/data/datasets.yaml")
87
+ p.add_argument("--model-config", default="configs/model/base.yaml")
88
+ p.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
89
+ p.add_argument("--batch-size", type=int, default=148) # Larger batch for inference (no grads)
90
+ p.add_argument("--output-dir", default="outputs")
91
+ return p.parse_args()
92
+
93
+
94
  def main() -> None:
95
  args = parse_args()
96
+ start_time = time.perf_counter()
97
+
98
  output_dir = Path(args.output_dir)
99
  output_dir.mkdir(parents=True, exist_ok=True)
100
 
101
+ # Load pipeline
102
+ print("Loading model...")
103
  pipeline, metadata = create_inference_pipeline(
104
  checkpoint_path=args.checkpoint,
105
  labels_path=args.labels,
 
108
  device=args.device,
109
  )
110
 
111
+ # Load data
112
+ data_cfg = load_yaml(args.data_config).data
113
+ summ_data = load_split(
114
+ Path(data_cfg["processed"]["summarization"]), args.split, load_summarization_jsonl
115
+ )
116
+ emot_data = load_split(Path(data_cfg["processed"]["emotion"]), args.split, load_emotion_jsonl)
117
+ topic_data = load_split(Path(data_cfg["processed"]["topic"]), args.split, load_topic_jsonl)
118
+
119
+ print(f"\nEvaluating on {args.split} split:")
120
+ print(f" Summarization: {len(summ_data)} samples")
121
+ print(f" Emotion: {len(emot_data)} samples")
122
+ print(f" Topic: {len(topic_data)} samples")
123
+
124
+ # --------------- Summarization ---------------
125
+
126
+ print("\nSummarization...")
127
+ preds, refs = [], []
128
+ for batch in tqdm(list(chunks(summ_data, args.batch_size)), desc="Summarization", unit="batch"):
129
+ preds.extend(pipeline.summarize([ex.source for ex in batch]))
130
+ refs.extend([ex.summary for ex in batch])
131
+
132
+ rouge = rouge_like(preds, refs)
133
+ bleu = calculate_bleu(preds, refs)
134
+ print(f" ROUGE-like: {rouge:.4f}, BLEU: {bleu:.4f}")
135
+
136
+ # --------------- Emotion ---------------
137
+
138
+ print("\nEmotion Classification...")
139
+ binarizer = MultiLabelBinarizer(classes=metadata.emotion)
140
+ binarizer.fit([[label] for label in metadata.emotion])
141
+ label_idx = {label: i for i, label in enumerate(metadata.emotion)}
142
+
143
+ pred_vecs, target_vecs = [], []
144
+ for batch in tqdm(list(chunks(emot_data, args.batch_size)), desc="Emotion", unit="batch"):
145
+ emotion_results = pipeline.predict_emotions([ex.text for ex in batch], threshold=0.3)
146
+ targets = binarizer.transform([list(ex.emotions) for ex in batch])
147
+
148
+ for pred, target in zip(emotion_results, targets, strict=False):
149
+ vec = torch.zeros(len(metadata.emotion))
150
+ for lbl in pred.labels:
151
+ if lbl in label_idx:
152
+ vec[label_idx[lbl]] = 1.0
153
+ pred_vecs.append(vec)
154
+ target_vecs.append(torch.tensor(target, dtype=torch.float32))
155
+
156
+ emotion_f1 = multilabel_f1(torch.stack(pred_vecs), torch.stack(target_vecs))
157
+ print(f" F1 (macro): {emotion_f1:.4f}")
158
+
159
+ # --------------- Topic ---------------
160
+
161
+ print("\nTopic Classification...")
162
+ topic_pred_labels: List[str] = []
163
+ topic_true_labels: List[str] = []
164
+ for batch in tqdm(list(chunks(topic_data, args.batch_size)), desc="Topic", unit="batch"):
165
+ topic_results = pipeline.predict_topics([ex.text for ex in batch])
166
+ topic_pred_labels.extend([r.label for r in topic_results])
167
+ topic_true_labels.extend([ex.topic for ex in batch])
168
+
169
+ topic_acc = accuracy(topic_pred_labels, topic_true_labels)
170
+ topic_report = classification_report_dict(
171
+ topic_pred_labels, topic_true_labels, labels=metadata.topic
172
  )
173
+ topic_cm = get_confusion_matrix(topic_pred_labels, topic_true_labels, labels=metadata.topic)
174
+ print(f" Accuracy: {topic_acc:.4f}")
175
 
176
+ # Save confusion matrix
177
  cm_path = output_dir / "topic_confusion_matrix.png"
178
  plot_confusion_matrix(topic_cm, metadata.topic, cm_path)
179
+ print(f" Confusion matrix saved: {cm_path}")
180
+
181
+ # --------------- Save Results ---------------
182
 
183
  results = {
184
  "split": args.split,
185
+ "summarization": {"rouge_like": rouge, "bleu": bleu},
186
  "emotion": {"f1_macro": emotion_f1},
187
+ "topic": {"accuracy": topic_acc, "classification_report": topic_report},
188
  }
189
 
190
  report_path = output_dir / "evaluation_report.json"
191
+ with open(report_path, "w") as f:
192
  json.dump(results, f, indent=2)
193
 
194
+ total_time = time.perf_counter() - start_time
195
+ print(f"\n{'=' * 50}")
196
+ print(f"Evaluation complete in {total_time:.1f}s")
197
+ print(f"Report saved: {report_path}")
198
+ print(f"{'=' * 50}")
199
  print(json.dumps(results, indent=2))
200
 
201
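
With the argparse defaults above, a standalone evaluation run (train.py no longer launches one automatically after this commit) would look something like:

    python scripts/evaluate.py --split test --checkpoint checkpoints/best.pt --labels artifacts/labels.json --output-dir outputs

Only --split test differs from the defaults in parse_args(); the checkpoint, labels, and output-dir values shown are already the defaults.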
 
scripts/export_model.py CHANGED
@@ -1,4 +1,12 @@
1
- """Rebuild and export the trained multitask model for downstream use."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Model export script for LexiMind.
3
+
4
+ Rebuilds the multitask model from configuration and exports trained weights
5
+ for deployment or distribution.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
scripts/export_tokenizer.py CHANGED
@@ -1,4 +1,12 @@
1
- """Export the FLAN-T5 tokenizer to the artifacts directory for reproducible inference."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Tokenizer export script for LexiMind.
3
+
4
+ Saves the FLAN-T5 tokenizer to the artifacts directory for reproducible
5
+ inference without requiring network access.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
scripts/inference.py CHANGED
@@ -1,4 +1,12 @@
1
- """Run inference with the multitask model."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Inference script for the LexiMind multitask model.
3
+
4
+ Command-line interface for running summarization, emotion detection, and topic
5
+ classification on arbitrary text inputs.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
scripts/preprocess_data.py CHANGED
@@ -1,4 +1,13 @@
1
- """Preprocess raw datasets into JSONL splits for LexiMind training."""
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Data preprocessing script for LexiMind.
3
+
4
+ Transforms raw datasets into standardized JSONL splits for training. Handles
5
+ summarization, emotion classification, topic classification, and book paragraph
6
+ extraction with text cleaning.
7
+
8
+ Author: Oliver Perrin
9
+ Date: December 2025
10
+ """
11
 
12
  from __future__ import annotations
13
 
scripts/train.py CHANGED
@@ -1,13 +1,20 @@
1
- """End-to-end training entrypoint for the LexiMind multitask model."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
  import json
6
- import platform
7
  import sys
8
- import warnings
9
  from pathlib import Path
10
- from typing import Any, Dict, Sequence, Tuple, cast
11
 
12
  import hydra
13
  import torch
@@ -37,8 +44,7 @@ from src.training.utils import set_seed
37
  from src.utils.io import save_state
38
  from src.utils.labels import LabelMetadata, save_label_metadata
39
 
40
- SplitExamples = Dict[str, list]
41
-
42
 
43
  SPLIT_ALIASES: Dict[str, Sequence[str]] = {
44
  "train": ("train",),
@@ -47,286 +53,214 @@ SPLIT_ALIASES: Dict[str, Sequence[str]] = {
47
  }
48
 
49
 
50
- def _read_examples(data_dir: Path, loader) -> SplitExamples:
51
- splits: SplitExamples = {}
52
- for canonical, aliases in SPLIT_ALIASES.items():
53
- found = False
54
  for alias in aliases:
55
- for extension in ("jsonl", "json"):
56
- candidate = data_dir / f"{alias}.{extension}"
57
- if candidate.exists():
58
- splits[canonical] = loader(str(candidate))
59
- found = True
60
  break
61
- if found:
62
  break
63
- if not found:
64
- raise FileNotFoundError(f"Missing {canonical} split under {data_dir}")
65
  return splits
66
 
67
 
68
- def _limit_samples(splits: SplitExamples, trainer_cfg: DictConfig) -> None:
69
- """Limit the number of samples in train/val splits if configured."""
70
- max_train = trainer_cfg.get("max_train_samples")
71
- max_val = trainer_cfg.get("max_val_samples")
72
-
73
- if max_train is not None and "train" in splits:
74
- original_len = len(splits["train"])
75
- limit = int(max_train)
76
- if original_len > limit:
77
- splits["train"] = splits["train"][:limit]
78
- print(f"Limited 'train' split from {original_len} to {limit} samples")
79
 
80
- if max_val is not None and "val" in splits:
81
- original_len = len(splits["val"])
82
- limit = int(max_val)
83
- if original_len > limit:
84
- splits["val"] = splits["val"][:limit]
85
- print(f"Limited 'val' split from {original_len} to {limit} samples")
86
 
 
87
 
88
- def compile_model_safe(model: torch.nn.Module) -> Tuple[Any, str]:
89
- """
90
- Safely compile model with best available backend.
91
 
92
- Returns:
93
- Compiled model and backend name used
94
- """
95
- system = platform.system()
 
 
 
 
96
 
97
- # NOTE: The 'inductor' backend causes NaN gradients during backward pass with
98
- # bfloat16 autocast on the decoder (seq2seq tasks). This is a known issue.
99
- # Use 'aot_eager' which provides graph optimization without inductor's codegen.
100
- # See: debug_compile_config.py and test_compile_modes.py for investigation.
101
 
102
- # Try aot_eager first - it's stable and provides good speedup
103
- try:
104
- print("Attempting to compile with 'aot_eager' backend...")
105
- compiled_model = torch.compile(model, backend="aot_eager")
106
- print("✓ Successfully compiled with 'aot_eager' backend")
107
- return cast(torch.nn.Module, compiled_model), "aot_eager"
108
- except Exception as e:
109
- warnings.warn(f"aot_eager backend failed: {e}", stacklevel=2)
110
-
111
- # Fallback: Try other backends (inductor may work for encoder-only tasks)
112
- backends_to_try = ["eager"]
113
- if system != "Windows":
114
- # On Linux, inductor might work for some configurations
115
- backends_to_try = ["eager", "inductor"]
116
-
117
- for backend in backends_to_try:
118
- try:
119
- print(f"Attempting to compile with '{backend}' backend...")
120
- compiled_model = torch.compile(model, backend=backend)
121
- # Trigger a dummy run or just return? torch.compile is lazy.
122
- # I assume it works if the call succeeds, runtime errors handled later.
123
- print(f"✓ Successfully compiled with '{backend}' backend")
124
- return cast(torch.nn.Module, compiled_model), backend
125
- except Exception as e:
126
- print(f"✗ '{backend}' backend failed: {e}")
127
- continue
128
-
129
- # No compilation worked, return original model
130
- warnings.warn("All torch.compile backends failed, using uncompiled model", stacklevel=2)
131
- return model, "none"
132
 
133
 
134
  @hydra.main(version_base=None, config_path="../configs", config_name="config")
135
  def main(cfg: DictConfig) -> None:
 
136
  print(OmegaConf.to_yaml(cfg))
137
  set_seed(cfg.seed)
138
 
139
- # Enable TF32 for Ampere/Ada GPUs (RTX 30xx/40xx)
140
- # This provides significant speedup on RTX 4070
141
  if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
142
- print("Enabling TF32 for Ampere/Ada GPU...")
143
  torch.set_float32_matmul_precision("high")
144
  torch.backends.cuda.matmul.allow_tf32 = True
145
  torch.backends.cudnn.allow_tf32 = True
146
- torch.backends.cudnn.benchmark = True # Auto-tunes convolution algorithms
147
 
148
- # Access configs directly from Hydra cfg object
149
  data_cfg = cfg.data
150
- training_cfg = cfg.training
151
 
152
- # Instantiate ModelConfig directly from cfg.model
153
- model_cfg = ModelConfig(
154
- d_model=cfg.model.d_model,
155
- num_encoder_layers=cfg.model.num_encoder_layers,
156
- num_decoder_layers=cfg.model.num_decoder_layers,
157
- num_attention_heads=cfg.model.num_attention_heads,
158
- ffn_dim=cfg.model.ffn_dim,
159
- dropout=cfg.model.dropout,
160
- use_pretrained=cfg.model.use_pretrained,
161
- pretrained_model_name=cfg.model.pretrained_model_name,
162
- activation=getattr(cfg.model, "activation", "gelu"),
163
- use_relative_position_bias=getattr(cfg.model, "use_relative_position_bias", False),
164
- )
165
 
166
- summarization_dir = Path(data_cfg.processed.summarization)
167
- emotion_dir = Path(data_cfg.processed.emotion)
168
- topic_dir = Path(data_cfg.processed.topic)
169
-
170
- summarization_splits = _read_examples(summarization_dir, load_summarization_jsonl)
171
- emotion_splits = _read_examples(emotion_dir, load_emotion_jsonl)
172
- topic_splits = _read_examples(topic_dir, load_topic_jsonl)
173
-
174
- # Apply sample limits if configured (e.g. for dev/medium runs)
175
- trainer_cfg = training_cfg.get("trainer", {})
176
- print("\nApplying dataset limits...")
177
- _limit_samples(summarization_splits, trainer_cfg)
178
- _limit_samples(emotion_splits, trainer_cfg)
179
- _limit_samples(topic_splits, trainer_cfg)
180
- print("Dataset limits applied.\n")
181
-
182
- tokenizer_section = data_cfg.get("tokenizer", {})
183
- tokenizer_config = TokenizerConfig(
184
- pretrained_model_name=tokenizer_section.get("pretrained_model_name", "google/flan-t5-base"),
185
- max_length=int(tokenizer_section.get("max_length", 512)),
186
- lower=bool(tokenizer_section.get("lower", False)),
187
- )
188
- tokenizer = Tokenizer(tokenizer_config)
189
 
190
- summarization_train = SummarizationDataset(summarization_splits["train"])
191
- summarization_val = SummarizationDataset(summarization_splits["val"])
192
 
193
- emotion_train = EmotionDataset(emotion_splits["train"])
194
- emotion_val = EmotionDataset(emotion_splits["val"], binarizer=emotion_train.binarizer)
 
 
 
 
 
 
195
 
 
 
 
 
196
  topic_train = TopicDataset(topic_splits["train"])
197
  topic_val = TopicDataset(topic_splits["val"], encoder=topic_train.encoder)
198
 
199
- dataloader_args = training_cfg.get("dataloader", {})
200
- batch_size = int(dataloader_args.get("batch_size", 8))
201
- shuffle = bool(dataloader_args.get("shuffle", True))
202
- # Optimization: Use multiple workers and pinned memory for faster data transfer
203
- num_workers = int(dataloader_args.get("num_workers", 4))
204
- pin_memory = bool(dataloader_args.get("pin_memory", True))
205
- max_length = tokenizer.config.max_length
206
 
207
  train_loaders = {
208
  "summarization": build_summarization_dataloader(
209
- summarization_train,
210
  tokenizer,
 
 
 
211
  batch_size=batch_size,
212
- shuffle=shuffle,
213
- max_source_length=max_length,
214
- max_target_length=max_length,
215
  num_workers=num_workers,
216
  pin_memory=pin_memory,
217
  ),
218
  "emotion": build_emotion_dataloader(
219
- emotion_train,
220
  tokenizer,
 
 
221
  batch_size=batch_size,
222
- shuffle=shuffle,
223
- max_length=max_length,
224
  num_workers=num_workers,
225
  pin_memory=pin_memory,
226
  ),
227
  "topic": build_topic_dataloader(
228
  topic_train,
229
  tokenizer,
 
 
230
  batch_size=batch_size,
231
- shuffle=shuffle,
232
- max_length=max_length,
233
  num_workers=num_workers,
234
  pin_memory=pin_memory,
235
  ),
236
  }
237
-
238
  val_loaders = {
239
  "summarization": build_summarization_dataloader(
240
- summarization_val,
241
  tokenizer,
242
- batch_size=batch_size,
243
  shuffle=False,
244
- max_source_length=max_length,
245
- max_target_length=max_length,
 
246
  num_workers=num_workers,
247
  pin_memory=pin_memory,
248
  ),
249
  "emotion": build_emotion_dataloader(
250
- emotion_val,
251
  tokenizer,
252
- batch_size=batch_size,
253
  shuffle=False,
254
- max_length=max_length,
 
255
  num_workers=num_workers,
256
  pin_memory=pin_memory,
257
  ),
258
  "topic": build_topic_dataloader(
259
  topic_val,
260
  tokenizer,
261
- batch_size=batch_size,
262
  shuffle=False,
263
- max_length=max_length,
 
264
  num_workers=num_workers,
265
  pin_memory=pin_memory,
266
  ),
267
  }
268
 
 
 
 
269
  device = torch.device(cfg.device)
270
  model = build_multitask_model(
271
  tokenizer,
272
- num_emotions=len(emotion_train.emotion_classes),
273
  num_topics=len(topic_train.topic_classes),
274
  config=model_cfg,
275
  ).to(device)
276
 
277
- optimizer_cfg = training_cfg.get("optimizer", {})
278
- lr = float(optimizer_cfg.get("lr", 3.0e-5))
279
- # Add weight decay for regularization to prevent overfitting
280
- weight_decay = float(optimizer_cfg.get("weight_decay", 0.01))
281
- optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
282
-
283
- # Optimize model execution graph with torch.compile (PyTorch 2.0+)
284
- # This fuses kernels and reduces overhead for faster training
285
- # Note: We only compile encoder/decoder for training, not the step() method used in generation
286
- # Compile encoder and decoder separately to avoid control flow issues in MultiTaskModel.forward
287
- # Compiling the top-level model causes excessive recompilation due to task switching
288
- use_compile = True # torch.compile for faster training
289
-
290
- if use_compile and model.encoder is not None:
291
- model.encoder, backend_used = compile_model_safe(model.encoder)
292
- else:
293
- backend_used = "disabled"
294
- if use_compile and model.decoder is not None:
295
- # Compile decoder.forward but keep step/greedy_decode uncompiled for generation
296
- model.decoder, _ = compile_model_safe(model.decoder)
297
-
298
- # Compile heads
299
- if use_compile:
300
- for name, head in model.heads.items():
301
- compiled_head, _ = compile_model_safe(head)
302
- model.heads[name] = compiled_head
303
- # Update the registered module as well to ensure parameters are tracked correctly
304
- setattr(model, f"head_{name}", compiled_head)
305
-
306
- print(f"Using compilation backend: {backend_used}")
307
-
308
- # Verify weights loaded correctly (check for NaNs/Infs)
309
- print("\n=== Weight Loading Verification ===")
310
- has_issues = False
311
- for name, param in model.named_parameters():
312
- if torch.isnan(param).any():
313
- print(f"WARNING: NaN in {name}")
314
- has_issues = True
315
- if torch.isinf(param).any():
316
- print(f"WARNING: Inf in {name}")
317
- has_issues = True
318
- if not has_issues:
319
- print("✓ No NaNs or Infs found in model parameters.")
320
- print("=== Verification Complete ===\n")
321
-
322
- trainer_cfg = training_cfg.get("trainer", {})
323
  trainer = Trainer(
324
  model=model,
325
  optimizer=optimizer,
326
  config=TrainerConfig(
327
  max_epochs=int(trainer_cfg.get("max_epochs", 1)),
328
  gradient_clip_norm=float(trainer_cfg.get("gradient_clip_norm", 1.0)),
329
- logging_interval=int(trainer_cfg.get("logging_interval", 50)),
330
  task_weights=trainer_cfg.get("task_weights"),
331
  label_smoothing=float(trainer_cfg.get("label_smoothing", 0.0)),
332
  gradient_accumulation_steps=int(trainer_cfg.get("gradient_accumulation_steps", 1)),
@@ -335,61 +269,43 @@ def main(cfg: DictConfig) -> None:
335
  tokenizer=tokenizer,
336
  )
337
 
338
- # Save checkpoint after every epoch to avoid losing good early checkpoints
339
- # Previous training showed overfitting at epoch 5 but good results at epoch 3
340
- def save_epoch_checkpoint(epoch: int, model: torch.nn.Module, history: Dict) -> None:
341
- epoch_path = Path(cfg.checkpoint_out).parent / f"epoch_{epoch}.pt"
342
- epoch_path.parent.mkdir(parents=True, exist_ok=True)
343
- save_state(model, str(epoch_path))
344
- print(f"Checkpoint saved: {epoch_path}")
345
 
346
- history = trainer.fit(train_loaders, val_loaders, checkpoint_callback=save_epoch_checkpoint)
 
347
 
348
- checkpoint_path = Path(cfg.checkpoint_out)
349
- checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
350
- save_state(model, str(checkpoint_path))
351
 
 
 
 
 
 
 
352
  labels_path = Path(cfg.labels_out)
353
  save_label_metadata(
354
- LabelMetadata(
355
- emotion=emotion_train.emotion_classes,
356
- topic=topic_train.topic_classes,
357
- ),
358
  labels_path,
359
  )
360
 
 
361
  history_path = Path(cfg.history_out)
362
  history_path.parent.mkdir(parents=True, exist_ok=True)
363
- with history_path.open("w", encoding="utf-8") as handle:
364
- json.dump(history, handle, indent=2)
365
-
366
- print(f"Training complete. Checkpoint saved to {checkpoint_path}")
367
- print(f"Label metadata saved to {labels_path}")
368
- print(f"History saved to {history_path}")
369
-
370
- # Run evaluation pipeline
371
- print("\nRunning evaluation pipeline...")
372
- import subprocess
373
-
374
- try:
375
- subprocess.run(
376
- [
377
- sys.executable,
378
- "scripts/evaluate.py",
379
- "--split",
380
- "test", # Evaluate on test set
381
- "--checkpoint",
382
- str(checkpoint_path),
383
- "--labels",
384
- str(labels_path),
385
- "--output-dir",
386
- "outputs",
387
- ],
388
- check=True,
389
- )
390
- print("Evaluation pipeline completed successfully.")
391
- except subprocess.CalledProcessError as e:
392
- print(f"Evaluation pipeline failed with error: {e}")
393
 
394
 
395
  if __name__ == "__main__":
 
1
+ """
2
+ Training script for LexiMind.
3
+
4
+ Orchestrates dataset loading, model construction, torch.compile optimization,
5
+ and multi-task training with checkpoint management.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
13
  import json
 
14
  import sys
15
+ import time
16
  from pathlib import Path
17
+ from typing import Any, Dict, Sequence
18
 
19
  import hydra
20
  import torch
 
44
  from src.utils.io import save_state
45
  from src.utils.labels import LabelMetadata, save_label_metadata
46
 
47
+ # --------------- Data Loading ---------------
 
48
 
49
  SPLIT_ALIASES: Dict[str, Sequence[str]] = {
50
  "train": ("train",),
 
53
  }
54
 
55
 
56
+ def load_splits(data_dir: Path, loader) -> Dict[str, list]:
57
+ """Load train/val/test splits from data directory."""
58
+ splits = {}
59
+ for name, aliases in SPLIT_ALIASES.items():
60
  for alias in aliases:
61
+ for ext in ("jsonl", "json"):
62
+ path = data_dir / f"{alias}.{ext}"
63
+ if path.exists():
64
+ splits[name] = loader(str(path))
 
65
  break
66
+ if name in splits:
67
  break
68
+ if name not in splits:
69
+ raise FileNotFoundError(f"Missing {name} split in {data_dir}")
70
  return splits
71
 
72
 
73
+ def limit_samples(splits: Dict[str, list], cfg: DictConfig) -> None:
74
+ """Apply sample limits for dev/debug runs."""
75
+ for split, key in [("train", "max_train_samples"), ("val", "max_val_samples")]:
76
+ limit = cfg.get(key)
77
+ if limit and split in splits and len(splits[split]) > limit:
78
+ splits[split] = splits[split][: int(limit)]
79
+ print(f" {split}: limited to {limit} samples")
 
 
 
 
80
 
 
 
 
 
 
 
81
 
82
+ # --------------- Model Compilation ---------------
83
 
 
 
 
84
 
85
+ def compile_model(model: torch.nn.Module) -> Any:
86
+ """Compile model with aot_eager backend (stable, avoids inductor NaN issues)."""
87
+ try:
88
+ compiled = torch.compile(model, backend="aot_eager")
89
+ print("✓ Compiled with aot_eager")
90
+ return compiled
91
+ except Exception:
92
+ return model
93
 
 
 
 
 
94
 
95
+ # --------------- Main ---------------
96
 
97
 
98
  @hydra.main(version_base=None, config_path="../configs", config_name="config")
99
  def main(cfg: DictConfig) -> None:
100
+ start_time = time.perf_counter()
101
  print(OmegaConf.to_yaml(cfg))
102
  set_seed(cfg.seed)
103
 
104
+ # Enable TF32 for Ampere+ GPUs (RTX 30xx/40xx) - ~2x matmul speedup
 
105
  if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
106
+ print(" TF32 enabled for Ampere GPU")
107
  torch.set_float32_matmul_precision("high")
108
  torch.backends.cuda.matmul.allow_tf32 = True
109
  torch.backends.cudnn.allow_tf32 = True
110
+ torch.backends.cudnn.benchmark = True # Auto-tune convolutions
111
+ torch.backends.cuda.enable_flash_sdp(True) # Flash attention if available
112
+ torch.backends.cuda.enable_mem_efficient_sdp(True) # Memory-efficient attention
113
+
114
+ # Disable debug APIs for max speed
115
+ torch.autograd.set_detect_anomaly(False)
116
+ torch.autograd.profiler.profile(False)
117
+ torch.autograd.profiler.emit_nvtx(False)
118
+
119
+ # --------------- Load Data ---------------
120
 
 
121
  data_cfg = cfg.data
122
+ trainer_cfg = cfg.training.get("trainer", {})
123
 
124
+ print("\nLoading datasets...")
125
+ summ_splits = load_splits(Path(data_cfg.processed.summarization), load_summarization_jsonl)
126
+ emot_splits = load_splits(Path(data_cfg.processed.emotion), load_emotion_jsonl)
127
+ topic_splits = load_splits(Path(data_cfg.processed.topic), load_topic_jsonl)
128
 
129
+ # Apply dev/debug sample limits
130
+ for splits in [summ_splits, emot_splits, topic_splits]:
131
+ limit_samples(splits, trainer_cfg)
132
 
133
+ # --------------- Tokenizer & Datasets ---------------
 
134
 
135
+ tok_cfg = data_cfg.get("tokenizer", {})
136
+ tokenizer = Tokenizer(
137
+ TokenizerConfig(
138
+ pretrained_model_name=tok_cfg.get("pretrained_model_name", "google/flan-t5-base"),
139
+ max_length=int(tok_cfg.get("max_length", 512)),
140
+ lower=bool(tok_cfg.get("lower", False)),
141
+ )
142
+ )
143
 
144
+ summ_train = SummarizationDataset(summ_splits["train"])
145
+ summ_val = SummarizationDataset(summ_splits["val"])
146
+ emot_train = EmotionDataset(emot_splits["train"])
147
+ emot_val = EmotionDataset(emot_splits["val"], binarizer=emot_train.binarizer)
148
  topic_train = TopicDataset(topic_splits["train"])
149
  topic_val = TopicDataset(topic_splits["val"], encoder=topic_train.encoder)
150
 
151
+ # --------------- DataLoaders ---------------
152
+
153
+ dl_cfg = cfg.training.get("dataloader", {})
154
+ batch_size = int(dl_cfg.get("batch_size", 8))
155
+ num_workers = int(dl_cfg.get("num_workers", 4))
156
+ pin_memory = bool(dl_cfg.get("pin_memory", True))
157
+ max_len = tokenizer.config.max_length
158
 
159
  train_loaders = {
160
  "summarization": build_summarization_dataloader(
161
+ summ_train,
162
  tokenizer,
163
+ shuffle=True,
164
+ max_source_length=max_len,
165
+ max_target_length=max_len,
166
  batch_size=batch_size,
 
 
 
167
  num_workers=num_workers,
168
  pin_memory=pin_memory,
169
  ),
170
  "emotion": build_emotion_dataloader(
171
+ emot_train,
172
  tokenizer,
173
+ shuffle=True,
174
+ max_length=max_len,
175
  batch_size=batch_size,
 
 
176
  num_workers=num_workers,
177
  pin_memory=pin_memory,
178
  ),
179
  "topic": build_topic_dataloader(
180
  topic_train,
181
  tokenizer,
182
+ shuffle=True,
183
+ max_length=max_len,
184
  batch_size=batch_size,
 
 
185
  num_workers=num_workers,
186
  pin_memory=pin_memory,
187
  ),
188
  }
 
189
  val_loaders = {
190
  "summarization": build_summarization_dataloader(
191
+ summ_val,
192
  tokenizer,
 
193
  shuffle=False,
194
+ max_source_length=max_len,
195
+ max_target_length=max_len,
196
+ batch_size=batch_size,
197
  num_workers=num_workers,
198
  pin_memory=pin_memory,
199
  ),
200
  "emotion": build_emotion_dataloader(
201
+ emot_val,
202
  tokenizer,
 
203
  shuffle=False,
204
+ max_length=max_len,
205
+ batch_size=batch_size,
206
  num_workers=num_workers,
207
  pin_memory=pin_memory,
208
  ),
209
  "topic": build_topic_dataloader(
210
  topic_val,
211
  tokenizer,
 
212
  shuffle=False,
213
+ max_length=max_len,
214
+ batch_size=batch_size,
215
  num_workers=num_workers,
216
  pin_memory=pin_memory,
217
  ),
218
  }
219
 
220
+ # --------------- Model ---------------
221
+
222
+ print("\nBuilding model...")
223
  device = torch.device(cfg.device)
224
+ model_cfg = ModelConfig(
225
+ d_model=cfg.model.d_model,
226
+ num_encoder_layers=cfg.model.num_encoder_layers,
227
+ num_decoder_layers=cfg.model.num_decoder_layers,
228
+ num_attention_heads=cfg.model.num_attention_heads,
229
+ ffn_dim=cfg.model.ffn_dim,
230
+ dropout=cfg.model.dropout,
231
+ use_pretrained=cfg.model.use_pretrained,
232
+ pretrained_model_name=cfg.model.pretrained_model_name,
233
+ activation=getattr(cfg.model, "activation", "gelu"),
234
+ use_relative_position_bias=getattr(cfg.model, "use_relative_position_bias", False),
235
+ )
236
  model = build_multitask_model(
237
  tokenizer,
238
+ num_emotions=len(emot_train.emotion_classes),
239
  num_topics=len(topic_train.topic_classes),
240
  config=model_cfg,
241
  ).to(device)
242
 
243
+ # Compile encoder/decoder for faster training (skip heads - small overhead)
244
+ if model.encoder is not None:
245
+ model.encoder = compile_model(model.encoder)
246
+ if model.decoder is not None:
247
+ model.decoder = compile_model(model.decoder)
248
+
249
+ # --------------- Optimizer & Trainer ---------------
250
+
251
+ opt_cfg = cfg.training.get("optimizer", {})
252
+ optimizer = torch.optim.AdamW(
253
+ model.parameters(),
254
+ lr=float(opt_cfg.get("lr", 3e-5)),
255
+ weight_decay=float(opt_cfg.get("weight_decay", 0.01)),
256
+ )
257
+
258
  trainer = Trainer(
259
  model=model,
260
  optimizer=optimizer,
261
  config=TrainerConfig(
262
  max_epochs=int(trainer_cfg.get("max_epochs", 1)),
263
  gradient_clip_norm=float(trainer_cfg.get("gradient_clip_norm", 1.0)),
 
264
  task_weights=trainer_cfg.get("task_weights"),
265
  label_smoothing=float(trainer_cfg.get("label_smoothing", 0.0)),
266
  gradient_accumulation_steps=int(trainer_cfg.get("gradient_accumulation_steps", 1)),
 
269
  tokenizer=tokenizer,
270
  )
271
 
272
+ # --------------- Train ---------------
273
+
274
+ def save_checkpoint(epoch: int, model: torch.nn.Module, history: Dict) -> None:
275
+ path = Path(cfg.checkpoint_out).parent / f"epoch_{epoch}.pt"
276
+ path.parent.mkdir(parents=True, exist_ok=True)
277
+ save_state(model, str(path))
 
278
 
279
+ print("\nStarting training...")
280
+ history = trainer.fit(train_loaders, val_loaders, checkpoint_callback=save_checkpoint)
281
 
282
+ # --------------- Save Outputs ---------------
 
 
283
 
284
+ # Best checkpoint
285
+ ckpt_path = Path(cfg.checkpoint_out)
286
+ ckpt_path.parent.mkdir(parents=True, exist_ok=True)
287
+ save_state(model, str(ckpt_path))
288
+
289
+ # Labels
290
  labels_path = Path(cfg.labels_out)
291
  save_label_metadata(
292
+ LabelMetadata(emotion=emot_train.emotion_classes, topic=topic_train.topic_classes),
 
 
 
293
  labels_path,
294
  )
295
 
296
+ # History
297
  history_path = Path(cfg.history_out)
298
  history_path.parent.mkdir(parents=True, exist_ok=True)
299
+ with history_path.open("w") as f:
300
+ json.dump(history, f, indent=2)
301
+
302
+ total_time = time.perf_counter() - start_time
303
+ print(f"\n{'=' * 50}")
304
+ print(f"Training complete in {total_time:.1f}s")
305
+ print(f" Checkpoint: {ckpt_path}")
306
+ print(f" Labels: {labels_path}")
307
+ print(f" History: {history_path}")
308
+ print(f"{'=' * 50}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
 
311
  if __name__ == "__main__":
src/api/app.py CHANGED
@@ -1,4 +1,11 @@
1
- """FastAPI application entrypoint."""
 
 
 
 
 
 
 
2
 
3
  from fastapi import FastAPI
4
 
 
1
+ """
2
+ FastAPI application factory for LexiMind.
3
+
4
+ Creates and configures the REST API application.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  from fastapi import FastAPI
11
 
src/api/dependencies.py CHANGED
@@ -1,4 +1,11 @@
1
- """Dependency providers for the FastAPI application."""
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ FastAPI dependency providers for LexiMind.
3
+
4
+ Manages lazy initialization and caching of the inference pipeline.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  from __future__ import annotations
11
 
src/api/routes.py CHANGED
@@ -1,4 +1,12 @@
1
- """API routes."""
 
 
 
 
 
 
 
 
2
 
3
  from typing import cast
4
 
 
1
+ """
2
+ API routes for LexiMind.
3
+
4
+ Defines REST endpoints for text analysis including summarization,
5
+ emotion detection, and topic classification.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from typing import cast
12
 
src/api/schemas.py CHANGED
@@ -1,4 +1,11 @@
1
- """API schemas."""
 
 
 
 
 
 
 
2
 
3
  from pydantic import BaseModel
4
 
 
1
+ """
2
+ Pydantic schemas for LexiMind API.
3
+
4
+ Defines request and response models for the REST API.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  from pydantic import BaseModel
11
 
src/data/dataloader.py CHANGED
@@ -1,8 +1,15 @@
1
- """Task-aware DataLoader builders for the LexiMind multitask suite."""
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
- from typing import List
6
 
7
  import torch
8
  from torch.utils.data import DataLoader
@@ -17,9 +24,11 @@ from .dataset import (
17
  )
18
  from .tokenization import Tokenizer
19
 
 
 
20
 
21
  class SummarizationCollator:
22
- """Prepare encoder-decoder batches for abstractive summarization."""
23
 
24
  def __init__(
25
  self,
@@ -32,36 +41,24 @@ class SummarizationCollator:
32
  self.max_source_length = max_source_length
33
  self.max_target_length = max_target_length
34
 
35
- def __call__(self, batch: List[SummarizationExample]) -> dict[str, torch.Tensor]:
36
- sources = [example.source for example in batch]
37
- targets = [example.summary for example in batch]
38
 
39
- source_enc = self.tokenizer.batch_encode(sources, max_length=self.max_source_length)
40
- target_enc = self.tokenizer.batch_encode(targets, max_length=self.max_target_length)
41
 
42
- # target_enc["input_ids"] is [BOS, A, B, EOS, PAD...]
43
- # We want:
44
- # tgt_ids (decoder input): [BOS, A, B, EOS] (drop last PAD or EOS if full)
45
- # labels (target): [A, B, EOS, PAD] (drop first BOS)
46
 
47
- ids = target_enc["input_ids"]
48
- mask = target_enc["attention_mask"]
49
-
50
- # Slice to create shifted inputs/targets
51
- # tgt_ids: everything except the last token
52
  tgt_ids = ids[:, :-1]
53
-
54
- # labels: everything except the first token (BOS)
55
  labels = ids[:, 1:].clone()
56
-
57
- # Adjust mask for labels to ignore padding
58
- # The mask corresponds to the original ids. We slice it to match labels.
59
- labels_mask = mask[:, 1:]
60
- labels[labels_mask == 0] = -100
61
 
62
  return {
63
- "src_ids": source_enc["input_ids"],
64
- "src_mask": source_enc["attention_mask"],
65
  "tgt_ids": tgt_ids,
66
  "labels": labels,
67
  }
@@ -77,11 +74,13 @@ class EmotionCollator:
77
  self.binarizer = dataset.binarizer
78
  self.max_length = max_length
79
 
80
- def __call__(self, batch: List[EmotionExample]) -> dict[str, torch.Tensor]:
81
- texts = [example.text for example in batch]
82
  encoded = self.tokenizer.batch_encode(texts, max_length=self.max_length)
83
- label_array = self.binarizer.transform([example.emotions for example in batch])
84
- labels = torch.as_tensor(label_array, dtype=torch.float32)
 
 
85
  return {
86
  "input_ids": encoded["input_ids"],
87
  "attention_mask": encoded["attention_mask"],
@@ -90,7 +89,7 @@ class EmotionCollator:
90
 
91
 
92
  class TopicCollator:
93
- """Prepare batches for topic classification using the projection head."""
94
 
95
  def __init__(
96
  self, tokenizer: Tokenizer, dataset: TopicDataset, *, max_length: int | None = None
@@ -99,11 +98,12 @@ class TopicCollator:
99
  self.encoder = dataset.encoder
100
  self.max_length = max_length
101
 
102
- def __call__(self, batch: List[TopicExample]) -> dict[str, torch.Tensor]:
103
- texts = [example.text for example in batch]
104
  encoded = self.tokenizer.batch_encode(texts, max_length=self.max_length)
105
  labels = torch.as_tensor(
106
- self.encoder.transform([example.topic for example in batch]), dtype=torch.long
 
107
  )
108
  return {
109
  "input_ids": encoded["input_ids"],
@@ -112,6 +112,9 @@ class TopicCollator:
112
  }
113
 
114
 
 
 
 
115
  def build_summarization_dataloader(
116
  dataset: SummarizationDataset,
117
  tokenizer: Tokenizer,
@@ -123,6 +126,7 @@ def build_summarization_dataloader(
123
  num_workers: int = 0,
124
  pin_memory: bool = False,
125
  ) -> DataLoader:
 
126
  collator = SummarizationCollator(
127
  tokenizer,
128
  max_source_length=max_source_length,
@@ -135,6 +139,7 @@ def build_summarization_dataloader(
135
  collate_fn=collator,
136
  num_workers=num_workers,
137
  pin_memory=pin_memory,
 
138
  )
139
 
140
 
@@ -148,6 +153,7 @@ def build_emotion_dataloader(
148
  num_workers: int = 0,
149
  pin_memory: bool = False,
150
  ) -> DataLoader:
 
151
  collator = EmotionCollator(tokenizer, dataset, max_length=max_length)
152
  return DataLoader(
153
  dataset,
@@ -156,6 +162,7 @@ def build_emotion_dataloader(
156
  collate_fn=collator,
157
  num_workers=num_workers,
158
  pin_memory=pin_memory,
 
159
  )
160
 
161
 
@@ -169,6 +176,7 @@ def build_topic_dataloader(
169
  num_workers: int = 0,
170
  pin_memory: bool = False,
171
  ) -> DataLoader:
 
172
  collator = TopicCollator(tokenizer, dataset, max_length=max_length)
173
  return DataLoader(
174
  dataset,
@@ -177,4 +185,5 @@ def build_topic_dataloader(
177
  collate_fn=collator,
178
  num_workers=num_workers,
179
  pin_memory=pin_memory,
 
180
  )
 
1
+ """
2
+ DataLoader builders for LexiMind.
3
+
4
+ Task-specific collators and factory functions for summarization, emotion, and topic.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  from __future__ import annotations
11
 
12
+ from typing import Dict, List
13
 
14
  import torch
15
  from torch.utils.data import DataLoader
 
24
  )
25
  from .tokenization import Tokenizer
26
 
27
+ # --------------- Collators ---------------
28
+
29
 
30
  class SummarizationCollator:
31
+ """Prepare encoder-decoder batches for seq2seq summarization."""
32
 
33
  def __init__(
34
  self,
 
41
  self.max_source_length = max_source_length
42
  self.max_target_length = max_target_length
43
 
44
+ def __call__(self, batch: List[SummarizationExample]) -> Dict[str, torch.Tensor]:
45
+ sources = [ex.source for ex in batch]
46
+ targets = [ex.summary for ex in batch]
47
 
48
+ src_enc = self.tokenizer.batch_encode(sources, max_length=self.max_source_length)
49
+ tgt_enc = self.tokenizer.batch_encode(targets, max_length=self.max_target_length)
50
 
51
+ # Shift targets: tgt_ids = [BOS, A, B], labels = [A, B, EOS]
52
+ ids = tgt_enc["input_ids"]
53
+ mask = tgt_enc["attention_mask"]
 
54
 
 
 
 
 
 
55
  tgt_ids = ids[:, :-1]
 
 
56
  labels = ids[:, 1:].clone()
57
+ labels[mask[:, 1:] == 0] = -100 # Mask padding in loss
 
 
 
 
58
 
59
  return {
60
+ "src_ids": src_enc["input_ids"],
61
+ "src_mask": src_enc["attention_mask"],
62
  "tgt_ids": tgt_ids,
63
  "labels": labels,
64
  }
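The shift above is standard teacher forcing. A standalone toy sketch (independent of the collator) showing how one padded target row becomes decoder inputs and loss labels, with padding mapped to -100 so cross-entropy ignores it:

import torch

# Toy target encoding: [BOS, A, B, EOS, PAD] plus its attention mask.
ids = torch.tensor([[1, 42, 43, 2, 0]])   # hypothetical ids: BOS=1, EOS=2, PAD=0
mask = torch.tensor([[1, 1, 1, 1, 0]])

tgt_ids = ids[:, :-1]            # [BOS, A, B, EOS] -> decoder input
labels = ids[:, 1:].clone()      # [A, B, EOS, PAD] -> loss target
labels[mask[:, 1:] == 0] = -100  # PAD positions ignored by cross-entropy

print(tgt_ids.tolist())  # [[1, 42, 43, 2]]
print(labels.tolist())   # [[42, 43, 2, -100]]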
 
74
  self.binarizer = dataset.binarizer
75
  self.max_length = max_length
76
 
77
+ def __call__(self, batch: List[EmotionExample]) -> Dict[str, torch.Tensor]:
78
+ texts = [ex.text for ex in batch]
79
  encoded = self.tokenizer.batch_encode(texts, max_length=self.max_length)
80
+ labels = torch.as_tensor(
81
+ self.binarizer.transform([ex.emotions for ex in batch]),
82
+ dtype=torch.float32,
83
+ )
84
  return {
85
  "input_ids": encoded["input_ids"],
86
  "attention_mask": encoded["attention_mask"],
 
89
 
90
 
91
  class TopicCollator:
92
+ """Prepare batches for single-label topic classification."""
93
 
94
  def __init__(
95
  self, tokenizer: Tokenizer, dataset: TopicDataset, *, max_length: int | None = None
 
98
  self.encoder = dataset.encoder
99
  self.max_length = max_length
100
 
101
+ def __call__(self, batch: List[TopicExample]) -> Dict[str, torch.Tensor]:
102
+ texts = [ex.text for ex in batch]
103
  encoded = self.tokenizer.batch_encode(texts, max_length=self.max_length)
104
  labels = torch.as_tensor(
105
+ self.encoder.transform([ex.topic for ex in batch]),
106
+ dtype=torch.long,
107
  )
108
  return {
109
  "input_ids": encoded["input_ids"],
 
112
  }
113
 
114
 
115
+ # --------------- Factory Functions ---------------
116
+
117
+
118
  def build_summarization_dataloader(
119
  dataset: SummarizationDataset,
120
  tokenizer: Tokenizer,
 
126
  num_workers: int = 0,
127
  pin_memory: bool = False,
128
  ) -> DataLoader:
129
+ """Create dataloader for summarization task."""
130
  collator = SummarizationCollator(
131
  tokenizer,
132
  max_source_length=max_source_length,
 
139
  collate_fn=collator,
140
  num_workers=num_workers,
141
  pin_memory=pin_memory,
142
+ persistent_workers=num_workers > 0, # Keep workers alive between epochs
143
  )
144
 
145
 
 
153
  num_workers: int = 0,
154
  pin_memory: bool = False,
155
  ) -> DataLoader:
156
+ """Create dataloader for emotion classification task."""
157
  collator = EmotionCollator(tokenizer, dataset, max_length=max_length)
158
  return DataLoader(
159
  dataset,
 
162
  collate_fn=collator,
163
  num_workers=num_workers,
164
  pin_memory=pin_memory,
165
+ persistent_workers=num_workers > 0,
166
  )
167
 
168
 
 
176
  num_workers: int = 0,
177
  pin_memory: bool = False,
178
  ) -> DataLoader:
179
+ """Create dataloader for topic classification task."""
180
  collator = TopicCollator(tokenizer, dataset, max_length=max_length)
181
  return DataLoader(
182
  dataset,
 
185
  collate_fn=collator,
186
  num_workers=num_workers,
187
  pin_memory=pin_memory,
188
+ persistent_workers=num_workers > 0,
189
  )
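All three builders pass persistent_workers=num_workers > 0. A generic sketch of the same pattern on a plain PyTorch DataLoader, independent of the LexiMind collators; the dataset and worker count are placeholders.

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(100).unsqueeze(1))
num_workers = 4

loader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    # Only valid when num_workers > 0; keeps worker processes alive between
    # epochs instead of re-forking them each time the loader is iterated.
    persistent_workers=num_workers > 0,
)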
src/data/dataset.py CHANGED
@@ -1,4 +1,13 @@
1
- """Dataset definitions for the LexiMind multitask training pipeline."""
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Dataset definitions for the LexiMind multitask training pipeline.
3
+
4
+ Defines PyTorch Dataset classes and data loading utilities for summarization,
5
+ emotion classification, and topic classification tasks. Supports both JSON
6
+ array and JSONL file formats.
7
+
8
+ Author: Oliver Perrin
9
+ Date: December 2025
10
+ """
11
 
12
  from __future__ import annotations
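The docstring above mentions both JSON array and JSONL support. A sketch of a loader that accepts either layout, plus the record shapes implied by the example classes the collators consume; the exact on-disk schema is an assumption, not the canonical format.

import json
from pathlib import Path

def load_records(path: Path) -> list[dict]:
    """Sketch: accept either a JSON array file or JSONL (one object per line)."""
    text = path.read_text().strip()
    if text.startswith("["):
        return json.loads(text)  # JSON array
    return [json.loads(line) for line in text.splitlines() if line.strip()]

# Hypothetical record shapes, mirroring the example classes used by the collators:
# {"source": "...", "summary": "..."}        -> SummarizationExample
# {"text": "...", "emotions": ["joy", ...]}  -> EmotionExample
# {"text": "...", "topic": "sports"}         -> TopicExample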
13
 
src/data/preprocessing.py CHANGED
@@ -1,52 +1,64 @@
1
- """Text preprocessing utilities built around Hugging Face tokenizers."""
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
  from dataclasses import dataclass, replace
6
- from typing import Iterable, List, Sequence
7
 
8
  import torch
9
- from sklearn.base import BaseEstimator, TransformerMixin
10
 
11
  from .tokenization import Tokenizer, TokenizerConfig
12
 
 
13
 
14
- class BasicTextCleaner(BaseEstimator, TransformerMixin):
15
- """Minimal text cleaner following scikit-learn conventions."""
16
 
17
- def __init__(self, lowercase: bool = True, strip: bool = True) -> None:
 
 
 
18
  self.lowercase = lowercase
19
- self.strip = strip
20
 
21
- def fit(self, texts: Iterable[str], y: Iterable[str] | None = None):
22
- return self
 
 
 
 
 
 
 
 
23
 
24
- def transform(self, texts: Iterable[str]) -> List[str]:
25
- return [self._clean_text(text) for text in texts]
 
 
26
 
27
- def _clean_text(self, text: str) -> str:
28
- item = text.strip() if self.strip else text
29
- if self.lowercase:
30
- item = item.lower()
31
- return " ".join(item.split())
32
 
33
 
34
  @dataclass
35
  class Batch:
36
- """Bundle of tensors returned by the text preprocessor."""
37
 
38
  input_ids: torch.Tensor
39
  attention_mask: torch.Tensor
40
  lengths: List[int]
41
 
42
 
43
- class TextPreprocessor:
44
- """Coordinate lightweight text cleaning and tokenization.
45
 
46
- When supplying an already-initialized tokenizer instance, its configuration is left
47
- untouched. If a differing ``max_length`` is requested, a ``ValueError`` is raised to
48
- avoid mutating shared tokenizer state.
49
- """
50
 
51
  def __init__(
52
  self,
@@ -56,19 +68,10 @@ class TextPreprocessor:
56
  tokenizer_name: str = "google/flan-t5-base",
57
  max_length: int | None = None,
58
  lowercase: bool = True,
59
- remove_stopwords: bool = False,
60
- sklearn_transformer: TransformerMixin | None = None,
61
  ) -> None:
62
- self.cleaner = BasicTextCleaner(lowercase=lowercase, strip=True)
63
- self.lowercase = lowercase
64
- if remove_stopwords:
65
- raise ValueError(
66
- "Stop-word removal is not supported because it conflicts with subword tokenizers; "
67
- "clean the text externally before initializing TextPreprocessor."
68
- )
69
- self._stop_words = None
70
- self._sklearn_transformer = sklearn_transformer
71
 
 
72
  if tokenizer is None:
73
  cfg = tokenizer_config or TokenizerConfig(pretrained_model_name=tokenizer_name)
74
  if max_length is not None:
@@ -78,52 +81,33 @@ class TextPreprocessor:
78
  self.tokenizer = tokenizer
79
  if max_length is not None and max_length != tokenizer.config.max_length:
80
  raise ValueError(
81
- "Provided tokenizer config.max_length does not match requested max_length; "
82
- "initialise the tokenizer with desired settings before passing it in."
83
  )
84
 
85
  self.max_length = max_length or self.tokenizer.config.max_length
86
 
87
  def clean_text(self, text: str) -> str:
88
- item = self.cleaner.transform([text])[0]
89
- return self._normalize_tokens(item)
90
-
91
- def _normalize_tokens(self, text: str) -> str:
92
- """Apply token-level normalization and optional stop-word filtering."""
93
- # Note: Pre-tokenization word-splitting is incompatible with subword tokenizers.
94
- # Stop-word filtering should be done post-tokenization or not at all for transformers.
95
- return text
96
-
97
- def _apply_sklearn_transform(self, texts: List[str]) -> List[str]:
98
- if self._sklearn_transformer is None:
99
- return texts
100
-
101
- transform = getattr(self._sklearn_transformer, "transform", None)
102
- if transform is None:
103
- raise AttributeError("Provided sklearn transformer must implement a 'transform' method")
104
- transformed = transform(texts)
105
- if isinstance(transformed, list):
106
- return transformed # assume downstream type is already list[str]
107
- if hasattr(transformed, "tolist"):
108
- transformed = transformed.tolist()
109
-
110
- result = list(transformed)
111
- if not all(isinstance(item, str) for item in result):
112
- result = [str(item) for item in result]
113
- return result
114
-
115
- def _prepare_texts(self, texts: Sequence[str]) -> List[str]:
116
- cleaned = self.cleaner.transform(texts)
117
- normalized = [self._normalize_tokens(text) for text in cleaned]
118
- return self._apply_sklearn_transform(normalized)
119
 
120
  def batch_encode(self, texts: Sequence[str]) -> Batch:
121
- cleaned = self._prepare_texts(texts)
 
122
  encoded = self.tokenizer.batch_encode(cleaned, max_length=self.max_length)
123
- input_ids: torch.Tensor = encoded["input_ids"]
124
- attention_mask: torch.Tensor = encoded["attention_mask"].to(dtype=torch.bool)
 
125
  lengths = attention_mask.sum(dim=1).tolist()
 
126
  return Batch(input_ids=input_ids, attention_mask=attention_mask, lengths=lengths)
127
 
128
  def __call__(self, texts: Sequence[str]) -> Batch:
 
129
  return self.batch_encode(texts)
 
 
 
 
 
 
 
1
+ """
2
+ Text preprocessing for LexiMind.
3
+
4
+ Lightweight text cleaning and tokenization pipeline for model input preparation.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  from __future__ import annotations
11
 
12
  from dataclasses import dataclass, replace
13
+ from typing import List, Sequence
14
 
15
  import torch
 
16
 
17
  from .tokenization import Tokenizer, TokenizerConfig
18
 
19
+ # --------------- Text Cleaning ---------------
20
 
 
 
21
 
22
+ class TextCleaner:
23
+ """Basic text normalization."""
24
+
25
+ def __init__(self, lowercase: bool = True) -> None:
26
  self.lowercase = lowercase
 
27
 
28
+ def clean(self, text: str) -> str:
29
+ """Strip, normalize whitespace, optionally lowercase."""
30
+ text = text.strip()
31
+ if self.lowercase:
32
+ text = text.lower()
33
+ return " ".join(text.split())
34
+
35
+ def clean_batch(self, texts: Sequence[str]) -> List[str]:
36
+ """Clean multiple texts."""
37
+ return [self.clean(t) for t in texts]
38
 
39
+ # Backwards compatibility alias
40
+ def transform(self, texts: Sequence[str]) -> List[str]:
41
+ """Alias for clean_batch (sklearn-style interface)."""
42
+ return self.clean_batch(texts)
43
 
44
+
45
+ # --------------- Batch Output ---------------
 
 
 
46
 
47
 
48
  @dataclass
49
  class Batch:
50
+ """Tokenized batch ready for model consumption."""
51
 
52
  input_ids: torch.Tensor
53
  attention_mask: torch.Tensor
54
  lengths: List[int]
55
 
56
 
57
+ # --------------- Preprocessor ---------------
 
58
 
59
+
60
+ class TextPreprocessor:
61
+ """Combines text cleaning with tokenization."""
 
62
 
63
  def __init__(
64
  self,
 
68
  tokenizer_name: str = "google/flan-t5-base",
69
  max_length: int | None = None,
70
  lowercase: bool = True,
 
 
71
  ) -> None:
72
+ self.cleaner = TextCleaner(lowercase=lowercase)
 
 
 
 
 
 
 
 
73
 
74
+ # Initialize or validate tokenizer
75
  if tokenizer is None:
76
  cfg = tokenizer_config or TokenizerConfig(pretrained_model_name=tokenizer_name)
77
  if max_length is not None:
 
81
  self.tokenizer = tokenizer
82
  if max_length is not None and max_length != tokenizer.config.max_length:
83
  raise ValueError(
84
+ "max_length conflicts with tokenizer config - "
85
+ "initialize tokenizer with desired settings"
86
  )
87
 
88
  self.max_length = max_length or self.tokenizer.config.max_length
89
 
90
  def clean_text(self, text: str) -> str:
91
+ """Clean a single text."""
92
+ return self.cleaner.clean(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  def batch_encode(self, texts: Sequence[str]) -> Batch:
95
+ """Clean and tokenize texts into a batch."""
96
+ cleaned = self.cleaner.clean_batch(texts)
97
  encoded = self.tokenizer.batch_encode(cleaned, max_length=self.max_length)
98
+
99
+ input_ids = encoded["input_ids"]
100
+ attention_mask = encoded["attention_mask"].to(dtype=torch.bool)
101
  lengths = attention_mask.sum(dim=1).tolist()
102
+
103
  return Batch(input_ids=input_ids, attention_mask=attention_mask, lengths=lengths)
104
 
105
  def __call__(self, texts: Sequence[str]) -> Batch:
106
+ """Alias for batch_encode."""
107
  return self.batch_encode(texts)
108
+
109
+
110
+ # --------------- Backwards Compatibility ---------------
111
+
112
+ # Keep old name for any imports
113
+ BasicTextCleaner = TextCleaner
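Quick usage of the cleaner above, based only on the methods shown (strip, whitespace collapse, optional lowercasing); the import path is assumed from the repo layout.

from src.data.preprocessing import TextCleaner  # path assumed; adjust to your packaging

cleaner = TextCleaner(lowercase=True)

print(cleaner.clean("  Hello   WORLD \n"))           # "hello world"
print(cleaner.clean_batch(["  A  b ", "C\td"]))      # ["a b", "c d"]
print(cleaner.transform(["Same as clean_batch"]))    # sklearn-style alias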
src/data/tokenization.py CHANGED
@@ -1,4 +1,13 @@
1
- """Tokenizer wrapper around HuggingFace models used across LexiMind."""
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Tokenizer facade for LexiMind.
3
+
4
+ Wraps HuggingFace tokenizers with a simplified interface that handles
5
+ special token management, batch encoding, and T5-specific conventions
6
+ for decoder input preparation.
7
+
8
+ Author: Oliver Perrin
9
+ Date: December 2025
10
+ """
11
 
12
  from __future__ import annotations
13
 
src/inference/factory.py CHANGED
@@ -1,4 +1,12 @@
1
- """Helpers to assemble an inference pipeline from saved artifacts."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Inference pipeline factory for LexiMind.
3
+
4
+ Assembles a complete inference pipeline from saved checkpoints, tokenizer
5
+ artifacts, and label metadata. Handles model loading and configuration.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
src/inference/pipeline.py CHANGED
@@ -1,9 +1,17 @@
1
- """Inference helpers for multitask LexiMind models."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
  from dataclasses import dataclass, fields, replace
6
- from typing import Any, Iterable, List, Sequence, cast
7
 
8
  import torch
9
  import torch.nn.functional as F
@@ -11,10 +19,12 @@ import torch.nn.functional as F
11
  from ..data.preprocessing import Batch, TextPreprocessor
12
  from ..data.tokenization import Tokenizer
13
 
 
 
14
 
15
  @dataclass
16
  class InferenceConfig:
17
- """Configuration knobs for the inference pipeline."""
18
 
19
  summary_max_length: int = 128
20
  emotion_threshold: float = 0.5
@@ -33,8 +43,11 @@ class TopicPrediction:
33
  confidence: float
34
 
35
 
 
 
 
36
  class InferencePipeline:
37
- """Run summarization, emotion, and topic heads through a unified interface."""
38
 
39
  def __init__(
40
  self,
@@ -50,50 +63,49 @@ class InferencePipeline:
50
  self.model = model
51
  self.tokenizer = tokenizer
52
  self.config = config or InferenceConfig()
53
- chosen_device = device or self.config.device
54
- if chosen_device is None:
55
- first_param = next(model.parameters(), None)
56
- chosen_device = first_param.device if first_param is not None else "cpu"
57
- self.device = torch.device(chosen_device)
 
 
 
58
  self.model.to(self.device)
59
  self.model.eval()
60
 
61
  self.preprocessor = preprocessor or TextPreprocessor(tokenizer=tokenizer)
62
- self.emotion_labels = list(emotion_labels) if emotion_labels is not None else None
63
- self.topic_labels = list(topic_labels) if topic_labels is not None else None
 
 
64
 
65
  def summarize(self, texts: Sequence[str], *, max_length: int | None = None) -> List[str]:
 
66
  if not texts:
67
  return []
68
- batch = self._batch_to_device(self.preprocessor.batch_encode(texts))
 
69
  src_ids = batch.input_ids
70
  src_mask = batch.attention_mask
71
  max_len = max_length or self.config.summary_max_length
72
 
73
- if not hasattr(self.model, "encoder") or not hasattr(self.model, "decoder"):
74
- raise RuntimeError(
75
- "Model must expose encoder and decoder attributes for summarization."
76
- )
77
-
78
- # Cast to Any to allow access to dynamic attributes encoder and decoder
79
  model = cast(Any, self.model)
 
 
80
 
81
  with torch.inference_mode():
82
- encoder_mask = (
 
83
  src_mask.unsqueeze(1) & src_mask.unsqueeze(2) if src_mask is not None else None
84
  )
85
- memory = model.encoder(src_ids, mask=encoder_mask)
86
- min_len = 10
87
 
88
- # Ban BOS, PAD, UNK from being generated
89
- ban_token_ids = [
90
- self.tokenizer.bos_token_id,
91
- self.tokenizer.pad_token_id,
92
- ]
93
- unk_id = getattr(self.tokenizer._tokenizer, "unk_token_id", None)
94
- if isinstance(unk_id, int):
95
- ban_token_ids.append(unk_id)
96
- ban_token_ids = [tid for tid in ban_token_ids if tid is not None]
97
 
98
  generated = model.decoder.greedy_decode(
99
  memory=memory,
@@ -101,16 +113,15 @@ class InferencePipeline:
101
  start_token_id=self.tokenizer.bos_token_id,
102
  end_token_id=self.tokenizer.eos_token_id,
103
  device=self.device,
104
- min_len=min_len,
105
- ban_token_ids=ban_token_ids,
106
  no_repeat_ngram_size=3,
107
  memory_mask=src_mask,
108
  )
109
 
110
- decoded_list = self.tokenizer.decode_batch(generated.tolist())
111
- final_summaries = decoded_list
112
 
113
- return final_summaries
114
 
115
  def predict_emotions(
116
  self,
@@ -118,78 +129,91 @@ class InferencePipeline:
118
  *,
119
  threshold: float | None = None,
120
  ) -> List[EmotionPrediction]:
 
121
  if not texts:
122
  return []
123
- if self.emotion_labels is None or not self.emotion_labels:
124
- raise RuntimeError("emotion_labels must be provided to decode emotion predictions")
125
 
126
- batch = self._batch_to_device(self.preprocessor.batch_encode(texts))
127
- model_inputs = self._batch_to_model_inputs(batch)
128
- decision_threshold = threshold or self.config.emotion_threshold
129
 
130
  with torch.inference_mode():
131
- logits = self.model.forward("emotion", model_inputs)
132
  probs = torch.sigmoid(logits)
133
 
134
- predictions: List[EmotionPrediction] = []
135
  for row in probs.cpu():
136
  pairs = [
137
  (label, score)
138
  for label, score in zip(self.emotion_labels, row.tolist(), strict=False)
139
- if score >= decision_threshold
140
  ]
141
- labels = [label for label, _ in pairs]
142
- scores = [score for _, score in pairs]
143
- predictions.append(EmotionPrediction(labels=labels, scores=scores))
144
- return predictions
 
 
 
 
 
145
 
146
  def predict_topics(self, texts: Sequence[str]) -> List[TopicPrediction]:
 
147
  if not texts:
148
  return []
149
- if self.topic_labels is None or not self.topic_labels:
150
- raise RuntimeError("topic_labels must be provided to decode topic predictions")
151
 
152
- batch = self._batch_to_device(self.preprocessor.batch_encode(texts))
153
- model_inputs = self._batch_to_model_inputs(batch)
154
 
155
  with torch.inference_mode():
156
- logits = self.model.forward("topic", model_inputs)
157
  probs = F.softmax(logits, dim=-1)
158
 
159
- results: List[TopicPrediction] = []
160
  for row in probs.cpu():
161
- scores = row.tolist()
162
- best_index = int(row.argmax().item())
163
  results.append(
164
- TopicPrediction(label=self.topic_labels[best_index], confidence=scores[best_index])
 
 
 
165
  )
166
  return results
167
 
168
- def batch_predict(self, texts: Iterable[str]) -> dict[str, object]:
 
 
 
 
 
 
169
  text_list = list(texts)
170
- if self.emotion_labels is None or not self.emotion_labels:
171
- raise RuntimeError("emotion_labels must be provided for batch predictions")
172
- if self.topic_labels is None or not self.topic_labels:
173
- raise RuntimeError("topic_labels must be provided for batch predictions")
174
  return {
175
  "summaries": self.summarize(text_list),
176
  "emotion": self.predict_emotions(text_list),
177
  "topic": self.predict_topics(text_list),
178
  }
179
 
180
- def _batch_to_device(self, batch: Batch) -> Batch:
181
- tensor_updates: dict[str, torch.Tensor] = {}
182
- for item in fields(batch):
183
- value = getattr(batch, item.name)
184
- if torch.is_tensor(value):
185
- tensor_updates[item.name] = value.to(self.device)
186
- if not tensor_updates:
187
- return batch
188
- return replace(batch, **tensor_updates)
 
189
 
190
  @staticmethod
191
- def _batch_to_model_inputs(batch: Batch) -> dict[str, torch.Tensor]:
192
- inputs: dict[str, torch.Tensor] = {"input_ids": batch.input_ids}
 
193
  if batch.attention_mask is not None:
194
  inputs["attention_mask"] = batch.attention_mask
195
  return inputs
 
1
+ """
2
+ Inference pipeline for LexiMind.
3
+
4
+ Unified interface for summarization, emotion detection, and topic classification
5
+ with batched processing and device management.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
13
  from dataclasses import dataclass, fields, replace
14
+ from typing import Any, Dict, List, Sequence, cast
15
 
16
  import torch
17
  import torch.nn.functional as F
 
19
  from ..data.preprocessing import Batch, TextPreprocessor
20
  from ..data.tokenization import Tokenizer
21
 
22
+ # --------------- Configuration ---------------
23
+
24
 
25
  @dataclass
26
  class InferenceConfig:
27
+ """Pipeline settings."""
28
 
29
  summary_max_length: int = 128
30
  emotion_threshold: float = 0.5
 
43
  confidence: float
44
 
45
 
46
+ # --------------- Pipeline ---------------
47
+
48
+
49
  class InferencePipeline:
50
+ """Multi-task inference with batched processing."""
51
 
52
  def __init__(
53
  self,
 
63
  self.model = model
64
  self.tokenizer = tokenizer
65
  self.config = config or InferenceConfig()
66
+
67
+ # Resolve device
68
+ chosen = device or self.config.device
69
+ if chosen is None:
70
+ param = next(model.parameters(), None)
71
+ chosen = param.device if param is not None else "cpu"
72
+ self.device = torch.device(chosen)
73
+
74
  self.model.to(self.device)
75
  self.model.eval()
76
 
77
  self.preprocessor = preprocessor or TextPreprocessor(tokenizer=tokenizer)
78
+ self.emotion_labels = list(emotion_labels) if emotion_labels else None
79
+ self.topic_labels = list(topic_labels) if topic_labels else None
80
+
81
+ # --------------- Summarization ---------------
82
 
83
  def summarize(self, texts: Sequence[str], *, max_length: int | None = None) -> List[str]:
84
+ """Generate summaries for input texts."""
85
  if not texts:
86
  return []
87
+
88
+ batch = self._to_device(self.preprocessor.batch_encode(texts))
89
  src_ids = batch.input_ids
90
  src_mask = batch.attention_mask
91
  max_len = max_length or self.config.summary_max_length
92
 
 
 
 
 
 
 
93
  model = cast(Any, self.model)
94
+ if not hasattr(model, "encoder") or not hasattr(model, "decoder"):
95
+ raise RuntimeError("Model must have encoder and decoder for summarization")
96
 
97
  with torch.inference_mode():
98
+ # Encode
99
+ enc_mask = (
100
  src_mask.unsqueeze(1) & src_mask.unsqueeze(2) if src_mask is not None else None
101
  )
102
+ memory = model.encoder(src_ids, mask=enc_mask)
 
103
 
104
+ # Decode with constraints to improve quality
105
+ ban_ids = [self.tokenizer.bos_token_id, self.tokenizer.pad_token_id]
106
+ unk = getattr(self.tokenizer._tokenizer, "unk_token_id", None)
107
+ if isinstance(unk, int):
108
+ ban_ids.append(unk)
 
 
 
 
109
 
110
  generated = model.decoder.greedy_decode(
111
  memory=memory,
 
113
  start_token_id=self.tokenizer.bos_token_id,
114
  end_token_id=self.tokenizer.eos_token_id,
115
  device=self.device,
116
+ min_len=10,
117
+ ban_token_ids=[i for i in ban_ids if i is not None],
118
  no_repeat_ngram_size=3,
119
  memory_mask=src_mask,
120
  )
121
 
122
+ return self.tokenizer.decode_batch(generated.tolist())
 
123
 
124
+ # --------------- Emotion ---------------
125
 
126
  def predict_emotions(
127
  self,
 
129
  *,
130
  threshold: float | None = None,
131
  ) -> List[EmotionPrediction]:
132
+ """Predict emotions for input texts."""
133
  if not texts:
134
  return []
135
+ if not self.emotion_labels:
136
+ raise RuntimeError("emotion_labels required for emotion prediction")
137
 
138
+ batch = self._to_device(self.preprocessor.batch_encode(texts))
139
+ inputs = self._model_inputs(batch)
140
+ thresh = threshold or self.config.emotion_threshold
141
 
142
  with torch.inference_mode():
143
+ logits = self.model.forward("emotion", inputs)
144
  probs = torch.sigmoid(logits)
145
 
146
+ results = []
147
  for row in probs.cpu():
148
  pairs = [
149
  (label, score)
150
  for label, score in zip(self.emotion_labels, row.tolist(), strict=False)
151
+ if score >= thresh
152
  ]
153
+ results.append(
154
+ EmotionPrediction(
155
+ labels=[label for label, _ in pairs],
156
+ scores=[score for _, score in pairs],
157
+ )
158
+ )
159
+ return results
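The emotion head is multi-label: a sigmoid per class, keeping every label at or above the threshold. A toy sketch of that decode step with a made-up label set and raw logits:

import torch

emotion_labels = ["joy", "sadness", "anger", "surprise"]  # illustrative label set
logits = torch.tensor([[2.1, -3.0, -0.8, 1.4]])           # one example's raw head output
threshold = 0.5

probs = torch.sigmoid(logits)[0]
kept = [(lbl, p.item()) for lbl, p in zip(emotion_labels, probs) if p >= threshold]
print(kept)  # [("joy", 0.891...), ("surprise", 0.802...)] -- the other two fall below 0.5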
160
+
161
+ # --------------- Topic ---------------
162
 
163
  def predict_topics(self, texts: Sequence[str]) -> List[TopicPrediction]:
164
+ """Predict topic for input texts."""
165
  if not texts:
166
  return []
167
+ if not self.topic_labels:
168
+ raise RuntimeError("topic_labels required for topic prediction")
169
 
170
+ batch = self._to_device(self.preprocessor.batch_encode(texts))
171
+ inputs = self._model_inputs(batch)
172
 
173
  with torch.inference_mode():
174
+ logits = self.model.forward("topic", inputs)
175
  probs = F.softmax(logits, dim=-1)
176
 
177
+ results = []
178
  for row in probs.cpu():
179
+ idx = int(row.argmax().item())
 
180
  results.append(
181
+ TopicPrediction(
182
+ label=self.topic_labels[idx],
183
+ confidence=row[idx].item(),
184
+ )
185
  )
186
  return results
187
 
188
+ # --------------- Batch Prediction ---------------
189
+
190
+ def batch_predict(self, texts: Sequence[str]) -> Dict[str, Any]:
191
+ """Run all three tasks on input texts."""
192
+ if not self.emotion_labels or not self.topic_labels:
193
+ raise RuntimeError("Both emotion_labels and topic_labels required")
194
+
195
  text_list = list(texts)
 
 
 
 
196
  return {
197
  "summaries": self.summarize(text_list),
198
  "emotion": self.predict_emotions(text_list),
199
  "topic": self.predict_topics(text_list),
200
  }
201
 
202
+ # --------------- Helpers ---------------
203
+
204
+ def _to_device(self, batch: Batch) -> Batch:
205
+ """Move batch tensors to device with non_blocking for speed."""
206
+ updates = {}
207
+ for f in fields(batch):
208
+ val = getattr(batch, f.name)
209
+ if torch.is_tensor(val):
210
+ updates[f.name] = val.to(self.device, non_blocking=True)
211
+ return replace(batch, **updates) if updates else batch
212
 
213
  @staticmethod
214
+ def _model_inputs(batch: Batch) -> Dict[str, torch.Tensor]:
215
+ """Extract model inputs from batch."""
216
+ inputs = {"input_ids": batch.input_ids}
217
  if batch.attention_mask is not None:
218
  inputs["attention_mask"] = batch.attention_mask
219
  return inputs
src/inference/postprocessing.py CHANGED
@@ -1,4 +1,11 @@
1
- """Output cleaning helpers."""
 
 
 
 
 
 
 
2
 
3
  from typing import List
4
 
 
1
+ """
2
+ Output postprocessing utilities for LexiMind.
3
+
4
+ Provides text cleaning helpers for model outputs.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  from typing import List
11
 
src/models/decoder.py CHANGED
@@ -1,16 +1,17 @@
1
- """
2
- Transformer Decoder (Pre-LN) - implementation.
3
-
4
- Implements:
5
- - create_causal_mask
6
- - TransformerDecoderLayer
7
- - TransformerDecoder (stack + naive greedy decoding)
8
-
9
- Conventions:
10
- - Masks are boolean: True = allowed, False = masked.
11
- - MultiHeadAttention expects masks broadcastable to (B, num_heads, T_q, T_k).
12
- - This decoder uses Pre-LN (RMSNorm before each sublayer).
13
- - RMSNorm is just simpler than LayerNorm and more computationally efficient, it's become the modern convention. These reasons are why I used it here.
 
14
  """
15
 
16
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
1
+ """Transformer Decoder implementation (Pre-LN).
2
+
3
+ This module implements the decoder component of the Transformer architecture:
4
+ - create_causal_mask: Generate causal attention masks
5
+ - TransformerDecoderLayer: Single decoder block with self-attn + cross-attn + FFN
6
+ - TransformerDecoder: Full stack with embeddings, positional encoding, and generation
7
+
8
+ Design notes:
9
+ - Pre-LN with RMSNorm for training stability
10
+ - Masks are boolean: True = attend, False = mask
11
+ - Supports T5-style relative position bias
12
+
13
+ Author: Oliver Perrin
14
+ Date: 2025-10-23
15
  """
16
 
17
  from typing import Any, Dict, List, Literal, Optional, Tuple, Union
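The docstring fixes the mask convention (boolean, True = attend) and mentions causal masks. A minimal sketch of a causal mask under that convention; the project's create_causal_mask signature is not shown here, so this illustrates the convention rather than the exact helper.

import torch

def causal_mask(size: int) -> torch.Tensor:
    # (size, size) boolean mask, True where attention is allowed:
    # position i may attend to positions j <= i (lower triangle).
    return torch.tril(torch.ones(size, size, dtype=torch.bool))

print(causal_mask(4))
# tensor([[ True, False, False, False],
#         [ True,  True, False, False],
#         [ True,  True,  True, False],
#         [ True,  True,  True,  True]])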
src/models/encoder.py CHANGED
@@ -1,17 +1,16 @@
1
- """
2
- Transformer encoder implementation (Pre-LN).
3
-
4
- Contains:
5
- - TransformerEncoderLayer: one encoder block (self-attention + FFN with residuals + LayerNorm (RMSNorm - modern convention))
6
- - TransformerEncoder: embedding + positional encoding + stack of encoder layers
7
-
8
- Design choices:
9
- - Pre-LN (RMSNorm before each sublayer) for stable training.
10
- - The FeedForward module is position-wise and does NOT include residuals or normalization.
11
- - MultiHeadAttention handles mask broadcasting from (B, S, S) -> (B, 1, S, S) internally.
12
- - The encoder accepts either token ids (LongTensor) or precomputed embeddings (FloatTensor).
13
- If you pass token ids, provide vocab_size when constructing the encoder and optionally pad_token_id.
14
- - Optionally collect attention weights by passing collect_attn=True to forward().
15
  """
16
 
17
  from typing import List, Literal, Optional, Tuple, Union
@@ -213,9 +212,9 @@ class TransformerEncoder(nn.Module):
213
  Build a 3D attention mask (batch, seq, seq) from input_ids and pad_token_id.
214
  True indicates valid positions; False indicates masked (pad).
215
  """
216
- assert (
217
- self.pad_token_id is not None
218
- ), "pad_token_id must be set to build padding mask from ids."
219
  # mask shape: (batch, seq) where True = token kept (non-pad)
220
  pad_mask = input_ids != self.pad_token_id
221
  # Convert to (batch, seq_q, seq_k) by outer product broadcasting
 
1
+ """Transformer Encoder implementation (Pre-LN).
2
+
3
+ This module implements the encoder component of the Transformer architecture:
4
+ - TransformerEncoderLayer: Single encoder block with self-attention + FFN
5
+ - TransformerEncoder: Full stack with embeddings and positional encoding
6
+
7
+ Design notes:
8
+ - Pre-LN with RMSNorm for training stability
9
+ - Masks are boolean: True = attend, False = mask
10
+ - Supports T5-style relative position bias
11
+
12
+ Author: Oliver Perrin
13
+ Date: 2025-10-23
 
14
  """
15
 
16
  from typing import List, Literal, Optional, Tuple, Union
 
212
  Build a 3D attention mask (batch, seq, seq) from input_ids and pad_token_id.
213
  True indicates valid positions; False indicates masked (pad).
214
  """
215
+ assert self.pad_token_id is not None, (
216
+ "pad_token_id must be set to build padding mask from ids."
217
+ )
218
  # mask shape: (batch, seq) where True = token kept (non-pad)
219
  pad_mask = input_ids != self.pad_token_id
220
  # Convert to (batch, seq_q, seq_k) by outer product broadcasting
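The outer-product broadcast mentioned above, as a standalone sketch: a (batch, seq) keep-mask expanded to the (batch, seq_q, seq_k) shape the attention layers expect.

import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 9, 7, 0, 0]])  # last two positions are padding

pad_mask = input_ids != pad_token_id                        # (batch, seq), True = real token
attn_mask = pad_mask.unsqueeze(1) & pad_mask.unsqueeze(2)   # (batch, seq_q, seq_k)

print(pad_mask.shape, attn_mask.shape)  # torch.Size([1, 5]) torch.Size([1, 5, 5])
print(attn_mask[0, 0])                  # tensor([ True,  True,  True, False, False])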
src/models/factory.py CHANGED
@@ -1,4 +1,14 @@
1
- """Factory helpers to assemble multitask models for inference/training."""
 
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """Factory helpers to assemble multitask models.
2
+
3
+ This module provides model construction and weight loading utilities:
4
+ - ModelConfig: Dataclass for architecture hyperparameters
5
+ - load_model_config: Load configuration from YAML
6
+ - build_multitask_model: Construct full model with task heads
7
+ - Weight loading: Transfer pretrained T5/FLAN-T5 or LLaMA weights
8
+
9
+ Author: Oliver Perrin
10
+ Date: 2025-10-23
11
+ """
12
 
13
  from __future__ import annotations
14
 
src/models/feedforward.py CHANGED
@@ -1,5 +1,11 @@
1
- """
2
- Position-wise Feed-Forward Network.
 
 
 
 
 
 
3
  """
4
 
5
  from typing import Literal, Optional
 
1
+ """Position-wise Feed-Forward Network.
2
+
3
+ This module implements the FFN sublayer used in Transformer blocks:
4
+ - Standard FFN: Two linear layers with activation (GELU/ReLU)
5
+ - Gated FFN: SwiGLU (LLaMA-style) or Gated-GELU (T5/FLAN-T5 style)
6
+
7
+ Author: Oliver Perrin
8
+ Date: 2025-10-23
9
  """
10
 
11
  from typing import Literal, Optional
src/models/heads.py CHANGED
@@ -1,13 +1,13 @@
1
- """
2
- Prediction heads for Transformer models.
3
 
4
- Includes:
5
- - ClassificationHead: sequence-level classification with simple pooling (mean/cls/max).
6
- - TokenClassificationHead: per-token classification (e.g., NER).
7
- - LMHead: language-modeling head mapping hidden states to vocabulary logits. Optional weight tying to an Embedding.
8
- - ProjectionHead: small projection MLP for representation learning / contrastive heads.
9
 
10
- Keep these heads minimal, well-tested, and easy to compose on top of encoder/decoder outputs.
 
11
  """
12
 
13
  from typing import Literal, Optional
@@ -117,12 +117,12 @@ class LMHead(nn.Module):
117
 
118
  if tie_embedding is not None:
119
  # Validate sizes
120
- assert (
121
- tie_embedding.num_embeddings == vocab_size
122
- ), "vocab size mismatch for weight tying"
123
- assert (
124
- tie_embedding.embedding_dim == d_model
125
- ), "embedding dim must match d_model for weight tying"
126
  # Tie weights: point the projection weight to the embedding weight Tensor
127
  # Remove the existing projection parameter in favor of the embedding weight
128
  # This keeps the same Parameter object, so updates affect both modules.
 
1
+ """Prediction heads for Transformer models.
 
2
 
3
+ This module provides task-specific output heads:
4
+ - ClassificationHead: Sequence-level classification with pooling (mean/cls/max)
5
+ - TokenClassificationHead: Per-token classification (NER, POS tagging)
6
+ - LMHead: Language modeling logits with optional weight tying
7
+ - ProjectionHead: MLP for representation learning / contrastive tasks
8
 
9
+ Author: Oliver Perrin
10
+ Date: 2025-10-23
11
  """
12
 
13
  from typing import Literal, Optional
 
117
 
118
  if tie_embedding is not None:
119
  # Validate sizes
120
+ assert tie_embedding.num_embeddings == vocab_size, (
121
+ "vocab size mismatch for weight tying"
122
+ )
123
+ assert tie_embedding.embedding_dim == d_model, (
124
+ "embedding dim must match d_model for weight tying"
125
+ )
126
  # Tie weights: point the projection weight to the embedding weight Tensor
127
  # Remove the existing projection parameter in favor of the embedding weight
128
  # This keeps the same Parameter object, so updates affect both modules.
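Weight tying in miniature: the LMHead points its output projection at the input embedding, guarded by the same size checks as the asserts above. A self-contained sketch with plain nn modules, not the project's LMHead itself.

import torch.nn as nn

vocab_size, d_model = 32128, 768
embedding = nn.Embedding(vocab_size, d_model)
lm_proj = nn.Linear(d_model, vocab_size, bias=False)

# Same checks as the asserts above.
assert embedding.num_embeddings == lm_proj.out_features
assert embedding.embedding_dim == lm_proj.in_features

# One tensor, two views: gradient updates through either module move the same weights.
lm_proj.weight = embedding.weight
assert lm_proj.weight is embedding.weight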
src/models/multitask.py CHANGED
@@ -1,18 +1,12 @@
1
- """
2
- Multitask model composition utilities.
3
-
4
- Provides:
5
- - MultiTaskModel: lightweight wrapper to compose an encoder and/or decoder with
6
- multiple task heads (classification, token classification, LM head, etc.)
7
- - add_head / remove_head helpers
8
- - forward(task_name, ...) that routes inputs to the correct sub-modules
9
- - compute_loss helper that uses common losses and ignore_index support
10
-
11
- Design goals:
12
- - Keep composition simple and explicit (use named heads per task)
13
- - Support encoder-only tasks (classification, token classification) and
14
- seq2seq tasks (encoder -> decoder -> LMHead)
15
- - Minimal dependencies on training loop; return logits and (optionally) loss
16
  """
17
 
18
  from typing import Any, Dict, Optional
 
1
+ """Multitask model composition utilities.
2
+
3
+ This module provides infrastructure for multi-task learning:
4
+ - MultiTaskModel: Compose encoder/decoder with multiple task heads
5
+ - Routing: forward(task_name, ...) dispatches to correct components
6
+ - Loss computation: Built-in cross-entropy with ignore_index support
7
+
8
+ Author: Oliver Perrin
9
+ Date: 2025-10-23
 
 
 
 
 
 
10
  """
11
 
12
  from typing import Any, Dict, Optional
src/models/positional_encoding.py CHANGED
@@ -1,10 +1,12 @@
1
- # src/models/positional_encoding.py
2
-
3
  """
4
  Positional Encoding for Transformer models.
5
 
6
- Injects information about the position of tokens in a sequence, since
7
- self-attention has no inherent notion of token order.
 
 
 
 
8
  """
9
 
10
  import math
 
 
 
1
  """
2
  Positional Encoding for Transformer models.
3
 
4
+ Provides sinusoidal position embeddings that inject sequential order information
5
+ into token representations. Required because self-attention is permutation-invariant
6
+ and has no inherent notion of token position.
7
+
8
+ Author: Oliver Perrin
9
+ Date: December 2025
10
  """
11
 
12
  import math
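The sinusoidal encoding the docstring describes follows the standard recipe (sin on even dimensions, cos on odd, with geometrically spaced frequencies). A sketch of that table; dimensions are placeholders and the project's class interface is not assumed.

import math
import torch

def sinusoidal_positions(max_len: int, d_model: int) -> torch.Tensor:
    """Standard sinusoidal table: even dims use sin, odd dims use cos."""
    position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)  # (max_len, 1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model)
    )  # (d_model / 2,)
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe  # added to token embeddings before the first encoder/decoder layer

print(sinusoidal_positions(4, 8).shape)  # torch.Size([4, 8])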
src/training/metrics.py CHANGED
@@ -1,4 +1,13 @@
1
- """Metric helpers used during training and evaluation."""
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Training and evaluation metrics for LexiMind.
3
+
4
+ Provides metric computation utilities for all task types: accuracy for topic
5
+ classification, multi-label F1 for emotion detection, and a ROUGE-like overlap
6
+ score for summarization quality assessment.
7
+
8
+ Author: Oliver Perrin
9
+ Date: December 2025
10
+ """
11
 
12
  from __future__ import annotations
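To make the emotion metric concrete, a toy micro-averaged multi-label F1 over a 0/1 prediction matrix; this is the generic definition, not necessarily the exact multilabel_f1 implementation in this module.

import torch

def micro_f1(preds: torch.Tensor, labels: torch.Tensor) -> float:
    """Micro-averaged F1 over a (batch, num_labels) 0/1 prediction matrix."""
    tp = (preds * labels).sum().item()
    fp = (preds * (1 - labels)).sum().item()
    fn = ((1 - preds) * labels).sum().item()
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

preds = torch.tensor([[1, 0, 1], [0, 1, 0]])
labels = torch.tensor([[1, 0, 0], [0, 1, 1]])
print(micro_f1(preds, labels))  # 2 TP, 1 FP, 1 FN -> P = R = 2/3, F1 ~= 0.667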
13
 
src/training/trainer.py CHANGED
@@ -1,38 +1,50 @@
1
- """Multi-task trainer coordinating summarization, emotion, and topic heads."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
- import shutil
6
  import time
7
  from collections import defaultdict
8
  from dataclasses import dataclass
9
- from typing import Callable, Dict, Iterator, List
10
 
11
  import mlflow
12
  import torch
13
  import torch.nn.functional as F
14
  from torch.utils.data import DataLoader
 
15
 
16
  from ..data.tokenization import Tokenizer
17
  from .metrics import accuracy, multilabel_f1, rouge_like
18
 
 
 
19
 
20
  @dataclass
21
  class TrainerConfig:
 
 
22
  max_epochs: int = 1
23
  gradient_clip_norm: float = 1.0
24
- logging_interval: int = 50
25
  task_weights: Dict[str, float] | None = None
26
  validation_samples: int = 3
27
  validation_max_length: int = 128
28
- label_smoothing: float = 0.0 # Label smoothing for regularization (e.g., 0.1)
29
  experiment_name: str = "LexiMind"
30
  run_name: str | None = None
31
  gradient_accumulation_steps: int = 1
32
 
33
 
 
34
  class Trainer:
35
- """Coordinates multi-task optimisation across task-specific dataloaders."""
36
 
37
  def __init__(
38
  self,
@@ -47,392 +59,315 @@ class Trainer:
47
  self.config = config
48
  self.device = device
49
  self.tokenizer = tokenizer
 
 
50
  self.emotion_loss = torch.nn.BCEWithLogitsLoss()
51
  self.topic_loss = torch.nn.CrossEntropyLoss()
52
- # Apply label smoothing to summarization task if configured
53
- self.label_smoothing = config.label_smoothing
54
- self._progress_last_len = 0
55
- self.gradient_accumulation_steps = max(1, config.gradient_accumulation_steps)
56
- self._nan_counter = 0 # Track consecutive NaNs
57
-
58
- # Mixed Precision Training
59
- # Initialize GradScaler for float16/bfloat16 training
60
- # This scales gradients to prevent underflow during backward pass
61
- # Note: bfloat16 generally doesn't need scaling, but we keep it for safety unless it causes NaNs
62
- self.scaler = torch.GradScaler("cuda", enabled=(device.type == "cuda"))
63
-
64
- # Initialize MLflow
65
  mlflow.set_experiment(config.experiment_name)
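The comments being removed above describe the autocast/GradScaler pattern: bfloat16 skips gradient scaling (its exponent range avoids underflow), while float16 needs the scaler. A generic sketch of that decision using the torch.amp API (PyTorch >= 2.3 naming assumed), not the trainer's exact loop.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_bf16 = device.type == "cuda" and torch.cuda.is_bf16_supported()
amp_dtype = torch.bfloat16 if use_bf16 else torch.float16

# GradScaler only matters for float16; bfloat16 has enough exponent range.
scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda" and not use_bf16))

def training_step(model, optimizer, batch, compute_loss):
    with torch.autocast("cuda", dtype=amp_dtype, enabled=(device.type == "cuda")):
        loss = compute_loss(model, batch)
    if use_bf16 or device.type != "cuda":
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    else:
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # unscale before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
    optimizer.zero_grad()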
66
 
 
 
 
 
 
 
 
67
  def fit(
68
  self,
69
  train_loaders: Dict[str, DataLoader],
70
  val_loaders: Dict[str, DataLoader] | None = None,
71
  checkpoint_callback: Callable | None = None,
72
  ) -> Dict[str, Dict[str, float]]:
73
- """Train the model.
74
-
75
- Args:
76
- train_loaders: Task-specific training dataloaders
77
- val_loaders: Optional task-specific validation dataloaders
78
- checkpoint_callback: Optional callback(epoch, model, history) to save checkpoints
79
-
80
- Returns:
81
- Training history dictionary
82
- """
83
  history: Dict[str, Dict[str, float]] = {}
84
- total_epochs = max(1, self.config.max_epochs)
85
- start_time = time.perf_counter()
86
 
87
  with mlflow.start_run(run_name=self.config.run_name):
88
- # Log configuration
89
- mlflow.log_params(
90
- {
91
- "max_epochs": self.config.max_epochs,
92
- "gradient_clip_norm": self.config.gradient_clip_norm,
93
- "label_smoothing": self.config.label_smoothing,
94
- "task_weights": str(self.config.task_weights),
95
- "device": str(self.device),
96
- }
97
  )
98
 
99
- for epoch in range(1, total_epochs + 1):
100
  epoch_start = time.perf_counter()
101
- train_metrics = self._run_epoch(
102
- train_loaders,
103
- train=True,
104
- epoch=epoch,
105
- total_epochs=total_epochs,
106
- epoch_start=epoch_start,
107
- global_start=start_time,
108
- )
109
- history[f"train_epoch_{epoch}"] = train_metrics
110
 
111
- # Log training metrics to MLflow
112
- for k, v in train_metrics.items():
113
- if k != "epoch":
114
- mlflow.log_metric(f"train_{k}", v, step=epoch)
115
 
 
116
  if val_loaders:
117
  val_metrics = self._run_epoch(val_loaders, train=False, epoch=epoch)
118
  history[f"val_epoch_{epoch}"] = val_metrics
 
119
 
120
- # Log validation metrics to MLflow
121
- for k, v in val_metrics.items():
122
- if k != "epoch":
123
- mlflow.log_metric(f"val_{k}", v, step=epoch)
124
-
125
- # Generate sample summaries for manual quality assessment
126
  if "summarization" in val_loaders:
127
  self._validate_generation(val_loaders["summarization"], epoch)
128
 
129
- # Save checkpoint after each epoch
130
- if checkpoint_callback is not None:
131
  checkpoint_callback(epoch, self.model, history)
132
 
133
- epoch_duration = time.perf_counter() - epoch_start
134
- total_elapsed = time.perf_counter() - start_time
135
- self._print_epoch_progress(epoch, total_epochs, epoch_duration, total_elapsed)
 
 
 
 
 
 
 
136
 
 
 
137
  return history
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def _run_epoch(
140
  self,
141
  loaders: Dict[str, DataLoader],
142
  *,
143
  train: bool,
144
  epoch: int,
145
- total_epochs: int | None = None,
146
- epoch_start: float | None = None,
147
- global_start: float | None = None,
148
  ) -> Dict[str, float]:
149
- phase = "train" if train else "eval"
 
150
  self.model.train(train)
151
- metrics_accumulator: Dict[str, list[float]] = defaultdict(list)
152
- iterator_map: Dict[str, Iterator[Dict[str, torch.Tensor]]] = {
153
- task: iter(loader) for task, loader in loaders.items()
154
- }
155
  max_batches = max(len(loader) for loader in loaders.values())
156
- progress_enabled = (
157
- train
158
- and max_batches > 0
159
- and total_epochs is not None
160
- and epoch_start is not None
161
- and global_start is not None
 
 
 
162
  )
163
 
164
- def emit_progress(step: int, final: bool = False) -> None:
165
- if not progress_enabled:
166
- return
167
- total_epochs_value = total_epochs
168
- epoch_start_value = epoch_start
169
- global_start_value = global_start
170
- assert total_epochs_value is not None
171
- assert epoch_start_value is not None
172
- assert global_start_value is not None
173
- self._update_epoch_progress(
174
- epoch=epoch,
175
- total_epochs=total_epochs_value,
176
- step=step,
177
- total_steps=max_batches,
178
- epoch_start=epoch_start_value,
179
- global_start=global_start_value,
180
- final=final,
181
- )
182
-
183
- emit_progress(0)
184
-
185
  context = torch.enable_grad() if train else torch.no_grad()
186
  with context:
187
- for step in range(max_batches):
188
- # Mark step begin for CUDA Graphs (inductor) to handle memory reuse correctly
189
- if (
190
- train
191
- and self.device.type == "cuda"
192
- and hasattr(torch.compiler, "cudagraph_mark_step_begin")
193
- ):
194
- torch.compiler.cudagraph_mark_step_begin()
195
-
196
- backward_performed = False
197
- step_total_loss = 0.0
198
-
199
- # Mixed Precision Context
200
- # Using bfloat16 for my RTX 4070 (Ampere/Ada) - better stability than float16
201
- # Disable scaler for bfloat16 to prevent NaNs
202
- use_bfloat16 = self.device.type == "cuda" and torch.cuda.is_bf16_supported()
203
 
204
  for task, loader in loaders.items():
205
- batch = self._next_batch(iterator_map, loader, task)
206
  if batch is None:
207
  continue
208
 
209
- with torch.autocast(
210
- "cuda",
211
- dtype=torch.bfloat16 if use_bfloat16 else torch.float16,
212
- enabled=(self.device.type == "cuda"),
213
- ):
214
- loss, task_metrics = self._forward_task(task, batch, train)
215
 
 
216
  if torch.isnan(loss):
217
- if train:
218
- self._nan_counter += 1
219
- print(
220
- f"Warning: NaN loss detected for task '{task}'. Skipping update for this task. (Consecutive NaNs: {self._nan_counter})"
221
- )
222
- if self._nan_counter > 10:
223
- raise RuntimeError(
224
- "Too many consecutive NaN losses. Training is diverging."
225
- )
226
  continue
227
- else:
228
- if train:
229
- self._nan_counter = 0
230
-
231
- weight = self._task_weight(task)
232
- # Scale loss by gradient accumulation steps
233
- weighted_loss = (loss * weight) / self.gradient_accumulation_steps
234
- step_total_loss += weighted_loss.item() * self.gradient_accumulation_steps
235
 
236
- metrics_accumulator[f"{task}_loss"].append(loss.item())
237
- for metric_name, metric_value in task_metrics.items():
238
- metrics_accumulator[f"{task}_{metric_name}"].append(metric_value)
 
239
 
 
240
  if train:
241
- # Scale loss before backward to prevent underflow
242
- # We accumulate gradients from all tasks before stepping the optimizer
243
- # This effectively minimizes the weighted sum of losses: L_total = w1*L1 + w2*L2 + ...
244
- if use_bfloat16:
245
- # bfloat16 doesn't need scaling and it can cause NaNs
246
- weighted_loss.backward()
247
  else:
248
- self.scaler.scale(weighted_loss).backward()
249
- backward_performed = True
250
-
251
- if backward_performed:
252
- metrics_accumulator["total_loss"].append(step_total_loss)
253
-
254
- # Perform optimizer step only after accumulating enough gradients
255
- if (
256
- train
257
- and backward_performed
258
- and (step + 1) % self.gradient_accumulation_steps == 0
259
- ):
260
- # Unscale gradients before clipping
261
- if use_bfloat16:
262
- torch.nn.utils.clip_grad_norm_(
263
- self.model.parameters(), self.config.gradient_clip_norm
264
- )
265
- self.optimizer.step()
266
- self.optimizer.zero_grad()
267
- else:
268
- self.scaler.unscale_(self.optimizer)
269
- torch.nn.utils.clip_grad_norm_(
270
- self.model.parameters(), self.config.gradient_clip_norm
271
- )
272
-
273
- # Step optimizer using scaler
274
- self.scaler.step(self.optimizer)
275
- self.scaler.update()
276
- self.optimizer.zero_grad()
277
-
278
- if (
279
- train
280
- and self.config.logging_interval
281
- and (step + 1) % self.config.logging_interval == 0
282
- ):
283
- if torch.cuda.is_available() and self.device.type == "cuda":
284
- torch.cuda.empty_cache()
285
- emit_progress(step + 1)
286
- emit_progress(max_batches, final=True)
287
-
288
- averaged = {
289
- name: sum(values) / len(values)
290
- for name, values in metrics_accumulator.items()
291
- if values
292
- }
293
  averaged["epoch"] = float(epoch)
294
- metric_str = ", ".join(f"{k}={v:.4f}" for k, v in averaged.items() if k != "epoch")
295
- print(f"[{phase}] epoch {epoch}: {metric_str}")
 
 
 
296
  return averaged
297
 
298
- def _next_batch(
299
- self,
300
- iterator_map: Dict[str, Iterator[Dict[str, torch.Tensor]]],
301
- loader: DataLoader,
302
- task: str,
 
 
 
 
 
 
 
 
 
303
  ) -> Dict[str, torch.Tensor] | None:
 
304
  try:
305
- batch = next(iterator_map[task])
306
  except StopIteration:
307
- iterator_map[task] = iter(loader)
308
  try:
309
- batch = next(iterator_map[task])
310
  except StopIteration:
311
  return None
312
  return {
313
- key: value.to(self.device) if isinstance(value, torch.Tensor) else value
314
- for key, value in batch.items()
315
  }
316
 
 
 
317
  def _forward_task(
318
- self, task: str, batch: Dict[str, torch.Tensor], train: bool
319
  ) -> tuple[torch.Tensor, Dict[str, float]]:
 
320
  if task == "summarization":
321
- summarization_inputs = {
322
- "src_ids": batch["src_ids"],
323
- "tgt_ids": batch["tgt_ids"],
324
- }
325
- if "src_mask" in batch:
326
- summarization_inputs["src_mask"] = batch["src_mask"]
327
- logits = self.model.forward("summarization", summarization_inputs)
328
- vocab_size = logits.size(-1)
329
- # Apply label smoothing for regularization - prevents overconfident predictions
330
- loss = F.cross_entropy(
331
- logits.view(-1, vocab_size),
332
- batch["labels"].view(-1),
333
- ignore_index=-100,
334
- label_smoothing=self.label_smoothing,
335
- )
336
- summaries = self._decode_predictions(logits)
337
- references = self._decode_labels(batch["labels"])
338
- rouge = rouge_like(summaries, references)
339
- return loss, {"rouge_like": rouge}
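The summarization branch above computes token-level cross-entropy with label smoothing and ignore_index=-100 over padded positions. A toy standalone computation showing both knobs together:

import torch
import torch.nn.functional as F

vocab_size = 10
logits = torch.randn(2, 4, vocab_size)           # (batch, seq, vocab)
labels = torch.tensor([[3, 7, 2, -100],          # -100 marks padding, ignored by the loss
                       [5, 1, -100, -100]])

loss = F.cross_entropy(
    logits.view(-1, vocab_size),
    labels.view(-1),
    ignore_index=-100,
    label_smoothing=0.1,  # spreads a little probability mass off the gold token
)
print(loss.item())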
340
-
341
- if task == "emotion":
342
- emotion_inputs = {"input_ids": batch["input_ids"]}
343
- if "attention_mask" in batch:
344
- emotion_inputs["attention_mask"] = batch["attention_mask"]
345
- logits = self.model.forward("emotion", emotion_inputs)
346
- loss = self.emotion_loss(logits, batch["labels"].float())
347
- probs = torch.sigmoid(logits)
348
- preds = (probs > 0.5).int()
349
- labels = batch["labels"].int()
350
- f1 = multilabel_f1(preds, labels)
351
- return loss, {"f1": f1}
352
-
353
- if task == "topic":
354
- topic_inputs = {"input_ids": batch["input_ids"]}
355
- if "attention_mask" in batch:
356
- topic_inputs["attention_mask"] = batch["attention_mask"]
357
- logits = self.model.forward("topic", topic_inputs)
358
- loss = self.topic_loss(logits, batch["labels"])
359
- preds = logits.argmax(dim=-1)
360
- acc = accuracy(preds.tolist(), batch["labels"].tolist())
361
- return loss, {"accuracy": acc}
362
-
363
- raise ValueError(f"Unknown task '{task}'")
364
-
365
- def _task_weight(self, task: str) -> float:
366
- if not self.config.task_weights:
367
- return 1.0
368
- return self.config.task_weights.get(task, 1.0)
369
-
370
- def _decode_predictions(self, logits: torch.Tensor) -> List[str]:
371
- generated = logits.argmax(dim=-1)
372
- return self.tokenizer.decode_batch(generated.tolist())
 
373
 
374
  def _decode_labels(self, labels: torch.Tensor) -> List[str]:
 
375
  valid = labels.clone()
376
  valid[valid == -100] = self.tokenizer.pad_token_id
377
  return self.tokenizer.decode_batch(valid.tolist())
378
 
 
 
379
  def _validate_generation(self, val_loader: DataLoader, epoch: int) -> None:
380
- """Generate and print sample summaries to monitor quality during training."""
381
  self.model.eval()
382
- samples_generated = 0
383
- print(f"\n{'=' * 80}")
384
- print(f"[Validation Generation - Epoch {epoch}]")
385
- print(f"{'=' * 80}")
 
386
 
387
  with torch.no_grad():
388
- for batch in val_loader:
389
- if samples_generated >= self.config.validation_samples:
390
  break
391
 
392
  batch = {
393
  k: v.to(self.device) if isinstance(v, torch.Tensor) else v
394
  for k, v in batch.items()
395
  }
396
- src_ids = batch["src_ids"]
397
  src_mask = batch.get("src_mask")
398
- labels = batch["labels"]
399
-
400
- # Only process first item from batch
401
- src_ids = src_ids[:1]
402
  if src_mask is not None:
403
  src_mask = src_mask[:1]
404
- labels = labels[:1]
405
 
406
- # Encode source
407
- encoder_mask = None
408
- if src_mask is not None:
409
- encoder_mask = src_mask.unsqueeze(1) & src_mask.unsqueeze(2)
410
- memory = self.model.encoder(src_ids, mask=encoder_mask)
411
-
412
- # DEBUG: Check encoder output statistics
413
- if samples_generated == 0:
414
- print("\n[DEBUG] Encoder output stats:")
415
- print(f" Shape: {memory.shape}")
416
- print(f" Mean: {memory.mean().item():.6f}")
417
- print(f" Std: {memory.std().item():.6f}")
418
- print(f" Min: {memory.min().item():.6f}")
419
- print(f" Max: {memory.max().item():.6f}")
420
- print(f" Has NaN: {torch.isnan(memory).any().item()}")
421
- print(f" Has Inf: {torch.isinf(memory).any().item()}")
422
-
423
- # Check first few positions
424
- print(f" First position norm: {memory[0, 0].norm().item():.4f}")
425
- print(f" Last position norm: {memory[0, -1].norm().item():.4f}")
426
-
427
- # Ban special tokens from generation
428
- ban_token_ids = [self.tokenizer.bos_token_id, self.tokenizer.pad_token_id]
429
- unk_id = getattr(self.tokenizer._tokenizer, "unk_token_id", None)
430
- if isinstance(unk_id, int):
431
- ban_token_ids.append(unk_id)
432
- ban_token_ids = [tid for tid in ban_token_ids if tid is not None]
433
-
434
- # Generate using naive method (full forward, O(N^2)) for debugging
435
- generated = self.model.decoder.greedy_decode_naive(
436
  memory=memory,
437
  max_len=self.config.validation_max_length,
438
  start_token_id=self.tokenizer.bos_token_id,
@@ -441,139 +376,17 @@ class Trainer:
441
  memory_mask=src_mask,
442
  )
443
 
444
- # Decode
445
- source_text = self.tokenizer.decode(src_ids[0].tolist())
446
- generated_text = self.tokenizer.decode(generated[0].tolist())
447
- reference_text = self._decode_labels(labels)[0]
448
-
449
- print(f"\nSample {samples_generated + 1}:")
450
- print(
451
- f"Raw token IDs: {generated[0][:20].tolist()}..."
452
- ) # Debug: show first 20 tokens
453
- print(
454
- f"Source: {source_text[:200]}..."
455
- if len(source_text) > 200
456
- else f"Source: {source_text}"
457
- )
458
- print(f"Generated: {generated_text}")
459
- print(
460
- f"Reference: {reference_text[:200]}..."
461
- if len(reference_text) > 200
462
- else f"Reference: {reference_text}"
463
- )
464
- print("-" * 80)
465
 
466
- samples_generated += 1
 
 
 
 
 
467
 
468
- print(f"{'=' * 80}\n")
469
  self.model.train()
470
-
471
- def _print_epoch_progress(
472
- self,
473
- epoch: int,
474
- total_epochs: int,
475
- epoch_duration: float,
476
- total_elapsed: float,
477
- ) -> None:
478
- progress = epoch / total_epochs
479
- percent = progress * 100
480
- remaining_epochs = total_epochs - epoch
481
- eta = (total_elapsed / epoch) * remaining_epochs if epoch > 0 else 0.0
482
- bar = self._format_progress_bar(progress)
483
- message = (
484
- f"[progress] {bar} {percent:5.1f}% | epoch {epoch}/{total_epochs} "
485
- f"| last {epoch_duration:6.2f}s | total {total_elapsed:6.2f}s | ETA {eta:6.2f}s"
486
- )
487
- print(message, flush=True)
488
-
489
- @staticmethod
490
- def _format_progress_bar(progress: float, width: int = 20) -> str:
491
- clamped = max(0.0, min(1.0, progress))
492
- filled = int(round(clamped * width))
493
- bar = "#" * filled + "-" * (width - filled)
494
- return f"[{bar}]"
495
-
496
- def _update_epoch_progress(
497
- self,
498
- *,
499
- epoch: int,
500
- total_epochs: int,
501
- step: int,
502
- total_steps: int,
503
- epoch_start: float,
504
- global_start: float,
505
- final: bool = False,
506
- ) -> None:
507
- if total_steps <= 0 or total_epochs <= 0:
508
- return
509
- bounded_step = max(0, min(step, total_steps))
510
- step_fraction = bounded_step / total_steps
511
- epochs_completed = (epoch - 1) + step_fraction
512
- overall_progress = epochs_completed / total_epochs
513
- percent = overall_progress * 100.0
514
- epoch_elapsed = time.perf_counter() - epoch_start
515
- total_elapsed = time.perf_counter() - global_start
516
- if epochs_completed > 0:
517
- remaining_epochs = max(total_epochs - epochs_completed, 0.0)
518
- total_eta = (
519
- (total_elapsed / epochs_completed) * remaining_epochs if total_elapsed > 0 else 0.0
520
- )
521
- else:
522
- total_eta = 0.0
523
-
524
- if step > 0:
525
- epoch_eta = (epoch_elapsed / step) * (total_steps - step)
526
- else:
527
- epoch_eta = 0.0
528
-
529
- bar = self._format_progress_bar(overall_progress, width=self._progress_bar_width())
530
- message = (
531
- f"[progress] {bar} {percent:5.1f}% "
532
- f"e {epoch}/{total_epochs} "
533
- f"s {bounded_step}/{total_steps} "
534
- f"ep_eta {self._format_duration(epoch_eta)} "
535
- f"tot_eta {self._format_duration(total_eta)}"
536
- )
537
- display = self._truncate_to_terminal(message)
538
- padding = " " * max(self._progress_last_len - len(display), 0)
539
- print(f"\r{display}{padding}", end="", flush=True)
540
- if final:
541
- print()
542
- self._progress_last_len = 0
543
- else:
544
- self._progress_last_len = len(display)
545
-
546
- def _truncate_to_terminal(self, text: str) -> str:
547
- columns = self._terminal_width()
548
- if columns <= 0:
549
- return text
550
- if len(text) >= columns:
551
- return text[: max(columns - 1, 1)]
552
- return text
553
-
554
- def _progress_bar_width(self) -> int:
555
- columns = self._terminal_width()
556
- reserved = 60
557
- if columns <= reserved:
558
- return 10
559
- return max(10, min(30, columns - reserved))
560
-
561
- @staticmethod
562
- def _terminal_width() -> int:
563
- try:
564
- return shutil.get_terminal_size(fallback=(120, 20)).columns
565
- except OSError:
566
- return 120
567
-
568
- @staticmethod
569
- def _format_duration(seconds: float) -> str:
570
- seconds = max(0.0, seconds)
571
- if seconds >= 3600:
572
- hours = int(seconds // 3600)
573
- minutes = int((seconds % 3600) // 60)
574
- return f"{hours}h{minutes:02}m"
575
- if seconds >= 60:
576
- minutes = int(seconds // 60)
577
- secs = int(seconds % 60)
578
- return f"{minutes}m{secs:02}s"
579
- return f"{seconds:4.1f}s"
 
1
+ """
2
+ Multi-task Trainer for LexiMind.
3
+
4
+ Handles training across summarization, emotion, and topic heads with mixed precision,
5
+ gradient accumulation, and MLflow logging.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
 
13
  import time
14
  from collections import defaultdict
15
  from dataclasses import dataclass
16
+ from typing import Any, Callable, Dict, List
17
 
18
  import mlflow
19
  import torch
20
  import torch.nn.functional as F
21
  from torch.utils.data import DataLoader
22
+ from tqdm import tqdm
23
 
24
  from ..data.tokenization import Tokenizer
25
  from .metrics import accuracy, multilabel_f1, rouge_like
26
 
27
+ # --------------- Configuration ---------------
28
+
29
 
30
  @dataclass
31
  class TrainerConfig:
32
+ """Training hyperparameters."""
33
+
34
  max_epochs: int = 1
35
  gradient_clip_norm: float = 1.0
 
36
  task_weights: Dict[str, float] | None = None
37
  validation_samples: int = 3
38
  validation_max_length: int = 128
39
+ label_smoothing: float = 0.0
40
  experiment_name: str = "LexiMind"
41
  run_name: str | None = None
42
  gradient_accumulation_steps: int = 1
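
A hypothetical instantiation sketch for orientation (the import path and the values shown are illustrative assumptions, not project defaults):

    from src.training.trainer import TrainerConfig  # import path assumed

    config = TrainerConfig(
        max_epochs=3,
        gradient_clip_norm=1.0,
        gradient_accumulation_steps=2,
        label_smoothing=0.1,
        task_weights={"summarization": 1.0, "emotion": 1.0, "topic": 1.0},
    )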
43
 
44
 
45
+ # --------------- Trainer ---------------
46
  class Trainer:
47
+ """Multi-task trainer with AMP and gradient accumulation."""
48
 
49
  def __init__(
50
  self,
 
59
  self.config = config
60
  self.device = device
61
  self.tokenizer = tokenizer
62
+
63
+ # Task losses
64
  self.emotion_loss = torch.nn.BCEWithLogitsLoss()
65
  self.topic_loss = torch.nn.CrossEntropyLoss()
66
+
67
+ # AMP setup: bfloat16 for Ampere+ GPUs, float16 otherwise
68
+ self.use_amp = device.type == "cuda"
69
+ self.use_bfloat16 = self.use_amp and torch.cuda.is_bf16_supported()
70
+ self.scaler = torch.GradScaler("cuda", enabled=(self.use_amp and not self.use_bfloat16))
71
+
72
+ self._nan_counter = 0
 
 
 
 
 
 
73
  mlflow.set_experiment(config.experiment_name)
74
 
75
+ # CUDA optimizations
76
+ if device.type == "cuda":
77
+ torch.backends.cuda.enable_flash_sdp(True)
78
+ torch.backends.cuda.enable_mem_efficient_sdp(True)
79
+
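
The flags set above follow the usual torch.amp split: bfloat16 needs no loss scaling, while float16 routes through a GradScaler. A self-contained sketch of that dispatch on a toy model (illustration only, not LexiMind code):

    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_amp = device.type == "cuda"
    use_bf16 = use_amp and torch.cuda.is_bf16_supported()
    amp_dtype = torch.bfloat16 if use_bf16 else torch.float16
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp and not use_bf16)

    model = torch.nn.Linear(16, 4).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
    x = torch.randn(8, 16, device=device)
    y = torch.randint(0, 4, (8,), device=device)

    with torch.autocast(device.type, dtype=amp_dtype, enabled=use_amp):
        loss = torch.nn.functional.cross_entropy(model(x), y)

    if use_bf16:
        loss.backward()                      # bf16 has enough dynamic range: no scaling
    else:
        scaler.scale(loss).backward()        # fp16: scale to avoid gradient underflow
        scaler.unscale_(opt)                 # unscale before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    if use_bf16:
        opt.step()
    else:
        scaler.step(opt)
        scaler.update()
    opt.zero_grad()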
80
+ # --------------- Training Loop ---------------
81
+
82
  def fit(
83
  self,
84
  train_loaders: Dict[str, DataLoader],
85
  val_loaders: Dict[str, DataLoader] | None = None,
86
  checkpoint_callback: Callable | None = None,
87
  ) -> Dict[str, Dict[str, float]]:
88
+ """Train model across all tasks with progress tracking."""
 
 
 
 
 
 
 
 
 
89
  history: Dict[str, Dict[str, float]] = {}
90
+ total_start = time.perf_counter()
 
91
 
92
  with mlflow.start_run(run_name=self.config.run_name):
93
+ self._log_config()
94
+
95
+ # Epoch progress bar
96
+ epoch_pbar = tqdm(
97
+ range(1, self.config.max_epochs + 1),
98
+ desc="Training",
99
+ unit="epoch",
100
+ position=0,
 
101
  )
102
 
103
+ for epoch in epoch_pbar:
104
  epoch_start = time.perf_counter()
 
 
 
 
 
 
 
 
 
105
 
106
+ # Train
107
+ train_metrics = self._run_epoch(train_loaders, train=True, epoch=epoch)
108
+ history[f"train_epoch_{epoch}"] = train_metrics
109
+ self._log_metrics(train_metrics, "train", epoch)
110
 
111
+ # Validate
112
  if val_loaders:
113
  val_metrics = self._run_epoch(val_loaders, train=False, epoch=epoch)
114
  history[f"val_epoch_{epoch}"] = val_metrics
115
+ self._log_metrics(val_metrics, "val", epoch)
116
 
 
 
 
 
 
 
117
  if "summarization" in val_loaders:
118
  self._validate_generation(val_loaders["summarization"], epoch)
119
 
120
+ # Checkpoint
121
+ if checkpoint_callback:
122
  checkpoint_callback(epoch, self.model, history)
123
 
124
+ # Update epoch progress bar with metrics
125
+ epoch_time = time.perf_counter() - epoch_start
126
+ total_time = time.perf_counter() - total_start
127
+ desc = f"Epoch {epoch}/{self.config.max_epochs}"
128
+ if "total_loss" in train_metrics:
129
+ desc += f" | loss={train_metrics['total_loss']:.3f}"
130
+ epoch_pbar.set_description(desc)
131
+ epoch_pbar.set_postfix(
132
+ {"time": f"{epoch_time:.1f}s", "total": f"{total_time:.1f}s"}
133
+ )
134
 
135
+ total_time = time.perf_counter() - total_start
136
+ print(f"\n✓ Training complete in {total_time:.1f}s")
137
  return history
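
The checkpoint hook receives (epoch, model, history); a minimal callback sketch (the output path is an assumption):

    import torch

    def checkpoint_callback(epoch, model, history):
        # Persist weights each epoch; adapt the path to your checkpoint layout.
        torch.save(model.state_dict(), f"checkpoints/epoch_{epoch:02d}.pt")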
138
 
139
+ def _log_config(self) -> None:
140
+ """Log config to MLflow."""
141
+ mlflow.log_params(
142
+ {
143
+ "max_epochs": self.config.max_epochs,
144
+ "gradient_clip_norm": self.config.gradient_clip_norm,
145
+ "label_smoothing": self.config.label_smoothing,
146
+ "task_weights": str(self.config.task_weights),
147
+ }
148
+ )
149
+
150
+ def _log_metrics(self, metrics: Dict[str, float], prefix: str, epoch: int) -> None:
151
+ """Log metrics to MLflow."""
152
+ for k, v in metrics.items():
153
+ if k != "epoch":
154
+ mlflow.log_metric(f"{prefix}_{k}", v, step=epoch)
155
+
156
+ # --------------- Epoch Execution ---------------
157
+
158
  def _run_epoch(
159
  self,
160
  loaders: Dict[str, DataLoader],
161
  *,
162
  train: bool,
163
  epoch: int,
 
 
 
164
  ) -> Dict[str, float]:
165
+ """Run one epoch with progress bar."""
166
+ phase = "Train" if train else "Val"
167
  self.model.train(train)
168
+
169
+ metrics: Dict[str, List[float]] = defaultdict(list)
170
+ iterators = {task: iter(loader) for task, loader in loaders.items()}
 
171
  max_batches = max(len(loader) for loader in loaders.values())
172
+ accum_steps = self.config.gradient_accumulation_steps
173
+
174
+ # Batch progress bar (nested under epoch bar)
175
+ pbar = tqdm(
176
+ range(max_batches),
177
+ desc=f" {phase}",
178
+ unit="batch",
179
+ leave=False,
180
+ position=1,
181
  )
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  context = torch.enable_grad() if train else torch.no_grad()
184
  with context:
185
+ for step in pbar:
186
+ step_loss = 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  for task, loader in loaders.items():
189
+ batch = self._get_batch(iterators, loader, task)
190
  if batch is None:
191
  continue
192
 
193
+ # Forward with AMP
194
+ amp_dtype = torch.bfloat16 if self.use_bfloat16 else torch.float16
195
+ with torch.autocast("cuda", dtype=amp_dtype, enabled=self.use_amp):
196
+ loss, task_metrics = self._forward_task(task, batch)
 
 
197
 
198
+ # NaN check
199
  if torch.isnan(loss):
200
+ self._nan_counter += 1
201
+ if self._nan_counter > 10:
202
+ raise RuntimeError("Training diverging - too many NaN losses")
 
 
 
 
 
 
203
  continue
204
+ self._nan_counter = 0
 
 
 
 
 
 
 
205
 
206
+ # Record metrics
207
+ metrics[f"{task}_loss"].append(loss.item())
208
+ for name, val in task_metrics.items():
209
+ metrics[f"{task}_{name}"].append(val)
210
 
211
+ # Backward
212
  if train:
213
+ weight = (self.config.task_weights or {}).get(task, 1.0)
214
+ scaled = (loss * weight) / accum_steps
215
+ step_loss += scaled.item() * accum_steps
216
+
217
+ if self.use_bfloat16:
218
+ scaled.backward()
219
  else:
220
+ self.scaler.scale(scaled).backward()
221
+
222
+ # Optimizer step
223
+ if train and (step + 1) % accum_steps == 0:
224
+ self._optimizer_step()
225
+
226
+ if step_loss > 0:
227
+ metrics["total_loss"].append(step_loss)
228
+
229
+ # Update progress bar
230
+ if metrics["total_loss"]:
231
+ pbar.set_postfix({"loss": f"{metrics['total_loss'][-1]:.3f}"})
232
+
233
+ # Average and print summary
234
+ averaged = {k: sum(v) / len(v) for k, v in metrics.items() if v}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  averaged["epoch"] = float(epoch)
236
+
237
+ summary = f"[{phase.lower()}] epoch {epoch}: "
238
+ summary += ", ".join(f"{k}={v:.4f}" for k, v in averaged.items() if k != "epoch")
239
+ tqdm.write(summary)
240
+
241
  return averaged
242
 
243
+ def _optimizer_step(self) -> None:
244
+ """Optimizer step with gradient clipping."""
245
+ if self.use_bfloat16:
246
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.gradient_clip_norm)
247
+ self.optimizer.step()
248
+ else:
249
+ self.scaler.unscale_(self.optimizer)
250
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.gradient_clip_norm)
251
+ self.scaler.step(self.optimizer)
252
+ self.scaler.update()
253
+ self.optimizer.zero_grad()
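
The accumulation scheme divides each micro-batch loss by accum_steps so the summed gradients approximate one large-batch step, then the optimizer fires every accum_steps micro-batches. A standalone toy version of the same pattern (not project code):

    import torch

    model = torch.nn.Linear(4, 2)
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    accum_steps = 4

    for step in range(8):
        x = torch.randn(2, 4)
        y = torch.randint(0, 2, (2,))
        loss = torch.nn.functional.cross_entropy(model(x), y) / accum_steps
        loss.backward()                        # gradients accumulate across micro-batches
        if (step + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            opt.zero_grad()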
254
+
255
+ def _get_batch(
256
+ self, iterators: Dict, loader: DataLoader, task: str
257
  ) -> Dict[str, torch.Tensor] | None:
258
+ """Get next batch, cycling iterator if exhausted."""
259
  try:
260
+ batch = next(iterators[task])
261
  except StopIteration:
262
+ iterators[task] = iter(loader)
263
  try:
264
+ batch = next(iterators[task])
265
  except StopIteration:
266
  return None
267
  return {
268
+ k: v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v
269
+ for k, v in batch.items()
270
  }
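
The same cycling idea in isolation: every task contributes one batch per step, and shorter loaders restart so all tasks stay in the mix until the longest loader is exhausted (lists stand in for DataLoaders):

    loaders = {"summarization": [1, 2, 3, 4], "emotion": [10, 20]}
    iterators = {task: iter(seq) for task, seq in loaders.items()}

    for step in range(max(len(seq) for seq in loaders.values())):
        for task, seq in loaders.items():
            try:
                batch = next(iterators[task])
            except StopIteration:
                iterators[task] = iter(seq)    # restart the exhausted loader
                batch = next(iterators[task])
            print(step, task, batch)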
271
 
272
+ # --------------- Task Forward Passes ---------------
273
+
274
  def _forward_task(
275
+ self, task: str, batch: Dict[str, torch.Tensor]
276
  ) -> tuple[torch.Tensor, Dict[str, float]]:
277
+ """Route to task-specific forward pass."""
278
  if task == "summarization":
279
+ return self._forward_summarization(batch)
280
+ elif task == "emotion":
281
+ return self._forward_emotion(batch)
282
+ elif task == "topic":
283
+ return self._forward_topic(batch)
284
+ raise ValueError(f"Unknown task: {task}")
285
+
286
+ def _forward_summarization(
287
+ self, batch: Dict[str, torch.Tensor]
288
+ ) -> tuple[torch.Tensor, Dict[str, float]]:
289
+ """Seq2seq forward for summarization."""
290
+ inputs = {"src_ids": batch["src_ids"], "tgt_ids": batch["tgt_ids"]}
291
+ if "src_mask" in batch:
292
+ inputs["src_mask"] = batch["src_mask"]
293
+
294
+ logits = self.model.forward("summarization", inputs)
295
+ loss = F.cross_entropy(
296
+ logits.view(-1, logits.size(-1)),
297
+ batch["labels"].view(-1),
298
+ ignore_index=-100,
299
+ label_smoothing=self.config.label_smoothing,
300
+ )
301
+
302
+ # Quick ROUGE estimate
303
+ preds = self.tokenizer.decode_batch(logits.argmax(dim=-1).tolist())
304
+ refs = self._decode_labels(batch["labels"])
305
+ return loss, {"rouge_like": rouge_like(preds, refs)}
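
The -100 values are the standard ignore_index convention: padded target positions contribute nothing to the loss. A tiny standalone illustration:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, 5, 32)                      # (batch, seq_len, vocab)
    labels = torch.tensor([[3, 7, -100, -100, -100],
                           [1, 4, 9, 2, -100]])         # -100 marks padding

    loss = F.cross_entropy(
        logits.view(-1, logits.size(-1)),
        labels.view(-1),
        ignore_index=-100,
        label_smoothing=0.1,
    )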
306
+
307
+ def _forward_emotion(
308
+ self, batch: Dict[str, torch.Tensor]
309
+ ) -> tuple[torch.Tensor, Dict[str, float]]:
310
+ """Multi-label emotion classification."""
311
+ inputs = {"input_ids": batch["input_ids"]}
312
+ if "attention_mask" in batch:
313
+ inputs["attention_mask"] = batch["attention_mask"]
314
+
315
+ logits = self.model.forward("emotion", inputs)
316
+ loss = self.emotion_loss(logits, batch["labels"].float())
317
+ preds = (torch.sigmoid(logits) > 0.5).int()
318
+ return loss, {"f1": multilabel_f1(preds, batch["labels"].int())}
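
Unlike the single-label topic head below, each emotion is an independent yes/no decision, so the loss is BCE over raw logits and predictions come from thresholding sigmoids rather than argmax. Standalone illustration:

    import torch

    logits = torch.tensor([[2.0, -1.0, 0.3],
                           [-0.5, 1.5, -2.0]])          # (batch, n_emotions)
    targets = torch.tensor([[1.0, 0.0, 1.0],
                            [0.0, 1.0, 0.0]])

    loss = torch.nn.BCEWithLogitsLoss()(logits, targets)
    preds = (torch.sigmoid(logits) > 0.5).int()         # independent decision per label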
319
+
320
+ def _forward_topic(
321
+ self, batch: Dict[str, torch.Tensor]
322
+ ) -> tuple[torch.Tensor, Dict[str, float]]:
323
+ """Single-label topic classification."""
324
+ inputs = {"input_ids": batch["input_ids"]}
325
+ if "attention_mask" in batch:
326
+ inputs["attention_mask"] = batch["attention_mask"]
327
+
328
+ logits = self.model.forward("topic", inputs)
329
+ loss = self.topic_loss(logits, batch["labels"])
330
+ preds = logits.argmax(dim=-1)
331
+ return loss, {"accuracy": accuracy(preds.tolist(), batch["labels"].tolist())}
332
 
333
  def _decode_labels(self, labels: torch.Tensor) -> List[str]:
334
+ """Decode labels, replacing -100 with pad token."""
335
  valid = labels.clone()
336
  valid[valid == -100] = self.tokenizer.pad_token_id
337
  return self.tokenizer.decode_batch(valid.tolist())
338
 
339
+ # --------------- Validation Generation ---------------
340
+
341
  def _validate_generation(self, val_loader: DataLoader, epoch: int) -> None:
342
+ """Generate sample summaries for quality check."""
343
  self.model.eval()
344
+ n = self.config.validation_samples
345
+
346
+ tqdm.write(f"\n{'=' * 50}")
347
+ tqdm.write(f"[Validation Samples - Epoch {epoch}]")
348
+ tqdm.write(f"{'=' * 50}")
349
 
350
  with torch.no_grad():
351
+ for i, batch in enumerate(val_loader):
352
+ if i >= n:
353
  break
354
 
355
  batch = {
356
  k: v.to(self.device) if isinstance(v, torch.Tensor) else v
357
  for k, v in batch.items()
358
  }
359
+ src_ids = batch["src_ids"][:1]
360
  src_mask = batch.get("src_mask")
 
 
 
 
361
  if src_mask is not None:
362
  src_mask = src_mask[:1]
 
363
 
364
+ # Encode and generate
365
+ enc_mask = (
366
+ src_mask.unsqueeze(1) & src_mask.unsqueeze(2) if src_mask is not None else None
367
+ )
368
+ model: Any = self.model
369
+ memory = model.encoder(src_ids, mask=enc_mask)
370
+ generated = model.decoder.greedy_decode_naive(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  memory=memory,
372
  max_len=self.config.validation_max_length,
373
  start_token_id=self.tokenizer.bos_token_id,
 
376
  memory_mask=src_mask,
377
  )
378
 
379
+ # Decode and display
380
+ src = self.tokenizer.decode(src_ids[0].tolist())
381
+ out = self.tokenizer.decode(generated[0].tolist())
382
+ ref = self._decode_labels(batch["labels"][:1])[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
+ tqdm.write(f"\nSample {i + 1}:")
385
+ tqdm.write(f" Source: {src[:120]}..." if len(src) > 120 else f" Source: {src}")
386
+ tqdm.write(f" Generated: {out}")
387
+ tqdm.write(
388
+ f" Reference: {ref[:120]}..." if len(ref) > 120 else f" Reference: {ref}"
389
+ )
390
 
391
+ tqdm.write(f"{'=' * 50}\n")
392
  self.model.train()
 
src/training/utils.py CHANGED
@@ -1,4 +1,12 @@
1
- """Small training helpers."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Training utilities for LexiMind.
3
+
4
+ Provides reproducibility helpers including seed management for stdlib, PyTorch,
5
+ and NumPy random number generators with thread-safe spawning support.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
src/utils/config.py CHANGED
@@ -1,4 +1,11 @@
1
- """YAML config loader."""
 
 
 
 
 
 
 
2
 
3
  from dataclasses import dataclass
4
  from pathlib import Path
 
1
+ """
2
+ Configuration utilities for LexiMind.
3
+
4
+ Provides YAML configuration loading with validation.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  from dataclasses import dataclass
11
  from pathlib import Path
src/utils/io.py CHANGED
@@ -1,4 +1,11 @@
1
- """Checkpoint IO helpers."""
 
 
 
 
 
 
 
2
 
3
  from pathlib import Path
4
 
 
1
+ """
2
+ Checkpoint I/O utilities for LexiMind.
3
+
4
+ Handles model state serialization with support for torch.compile artifacts.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  from pathlib import Path
11
 
src/utils/labels.py CHANGED
@@ -1,4 +1,12 @@
1
- """Label metadata helpers for multitask inference."""
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
 
1
+ """
2
+ Label metadata utilities for LexiMind.
3
+
4
+ Manages persistence and loading of emotion and topic label vocabularies
5
+ for multitask inference.
6
+
7
+ Author: Oliver Perrin
8
+ Date: December 2025
9
+ """
10
 
11
  from __future__ import annotations
12
 
src/utils/logging.py CHANGED
@@ -1,4 +1,11 @@
1
- """Logging setup."""
 
 
 
 
 
 
 
2
 
3
  import logging
4
 
 
1
+ """
2
+ Logging utilities for LexiMind.
3
+
4
+ Provides centralized logging configuration and logger factory.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  import logging
11
 
src/utils/random.py CHANGED
@@ -1,4 +1,11 @@
1
- """Randomness helpers."""
 
 
 
 
 
 
 
2
 
3
  import random
4
 
 
1
+ """
2
+ Randomness utilities for LexiMind.
3
+
4
+ Provides seed management for reproducibility.
5
+
6
+ Author: Oliver Perrin
7
+ Date: December 2025
8
+ """
9
 
10
  import random
11
 
tests/test_training/test_trainer.py CHANGED
@@ -17,7 +17,7 @@ class TestTrainer(unittest.TestCase):
17
  self.model = MagicMock()
18
  self.model.to.return_value = self.model # Ensure .to() returns the same mock
19
  self.optimizer = MagicMock(spec=torch.optim.Optimizer)
20
- self.config = TrainerConfig(max_epochs=1, logging_interval=1)
21
  self.device = torch.device("cpu")
22
  self.tokenizer = MagicMock()
23
  self.tokenizer.pad_token_id = 0
 
17
  self.model = MagicMock()
18
  self.model.to.return_value = self.model # Ensure .to() returns the same mock
19
  self.optimizer = MagicMock(spec=torch.optim.Optimizer)
20
+ self.config = TrainerConfig(max_epochs=1)
21
  self.device = torch.device("cpu")
22
  self.tokenizer = MagicMock()
23
  self.tokenizer.pad_token_id = 0