--- train config ---
TRAIN_DATASET = "sayakpaul/hf-codegen-v2"
TRAIN_BASE_MODEL = "meta-llama/Llama-3.2-1B"
TRAIN_MODEL_HUB_NAME = "steve329"
TRAIN_TUNED_MODEL_NAME = "test_fine_llama3_qlora_all_3"
TRAIN_PEFT_LAYERS = "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj"
TRAIN_LORA_ALPHA = 16
TRAIN_LORA_DROPOUT = 0.05
TRAIN_LORA_R = 8
TRAIN_SAMPLES = 1000
TRAIN_VAL_SAMPLES = 10  # about 30 steps per val sample
TRAIN_SEQ_LEN = 1024
TRAIN_MAX_STEPS = 400
TRAIN_VAL_STEPS = 80
TRAIN_SAVE_STEPS = 1000
TRAIN_LOG_STEPS = 10
TRAIN_TRAIN_BSIZE = 1
TRAIN_VAL_BSIZE = 1
TRAIN_LR = 5e-5
TRAIN_LR_TYPE = "cosine"
TRAIN_WARMUP_STEPS = 0
TRAIN_FP16 = True
TRAIN_BF16 = False
TRAIN_ACCU_STEPS = 1
TRAIN_WEIGHT_DECAY = 0.01
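For reference, here is a minimal sketch of how the knobs above would typically be wired into peft and transformers objects. This is an assumption about the training script, not its actual code; the 4-bit BitsAndBytesConfig in particular is inferred only from the "qlora" in TRAIN_TUNED_MODEL_NAME.

import torch
from transformers import BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig

# Assumed QLoRA-style 4-bit quantization (inferred from the model name above).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,   # TRAIN_FP16 = True
)

lora_config = LoraConfig(
    r=8,                                    # TRAIN_LORA_R
    lora_alpha=16,                          # TRAIN_LORA_ALPHA
    lora_dropout=0.05,                      # TRAIN_LORA_DROPOUT
    target_modules=TRAIN_PEFT_LAYERS.split(","),
    bias="none",
    task_type="CAUSAL_LM",
)

training_args = TrainingArguments(
    output_dir=TRAIN_TUNED_MODEL_NAME,
    max_steps=400,                          # TRAIN_MAX_STEPS
    per_device_train_batch_size=1,          # TRAIN_TRAIN_BSIZE
    per_device_eval_batch_size=1,           # TRAIN_VAL_BSIZE
    gradient_accumulation_steps=1,          # TRAIN_ACCU_STEPS
    learning_rate=5e-5,                     # TRAIN_LR
    lr_scheduler_type="cosine",             # TRAIN_LR_TYPE
    warmup_steps=0,                         # TRAIN_WARMUP_STEPS
    weight_decay=0.01,                      # TRAIN_WEIGHT_DECAY
    fp16=True,                              # TRAIN_FP16 (bf16=False)
    eval_strategy="steps",                  # "evaluation_strategy" in older transformers
    eval_steps=80,                          # TRAIN_VAL_STEPS
    save_steps=1000,                        # TRAIN_SAVE_STEPS
    logging_steps=10,                       # TRAIN_LOG_STEPS
)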
--- test config ---
TEST_NUM_SAMPLES_PER_TASK = 15
TEST_NUM_TASK_IDS = 50
TEST_MAX_NEW_TOKENS = 512
TEST_TEMPERATURE = 0.2
TEST_TOP_K = 50
TEST_TOP_P = 0.95
TEST_DO_SAMPLE = True
TEST_REPETITION_PENALTY = 1.0
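And a sketch of how these sampling knobs would be passed to model.generate for one task. This is hypothetical harness code: the placeholder prompt and the base-model load are illustrations only (in practice the tuned adapter would be loaded, e.g. via PeftModel).

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

prompt = "def add(a, b):"       # placeholder; real prompts come from the eval tasks
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=512,         # TEST_MAX_NEW_TOKENS
    do_sample=True,             # TEST_DO_SAMPLE
    temperature=0.2,            # TEST_TEMPERATURE
    top_k=50,                   # TEST_TOP_K
    top_p=0.95,                 # TEST_TOP_P
    repetition_penalty=1.0,     # TEST_REPETITION_PENALTY (1.0 = no penalty)
    num_return_sequences=15,    # TEST_NUM_SAMPLES_PER_TASK
)
completions = tokenizer.batch_decode(outputs, skip_special_tokens=True)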
--- tuning ---
trainable params: 5,636,096 || all params: 1,241,450,496 || trainable%: 0.4540
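The count checks out against Llama-3.2-1B's architecture: LoRA adds r*(d_in + d_out) parameters per adapted linear layer, and with r = 8 over all seven projections in each of the model's 16 decoder layers (hidden size 2048, MLP size 8192, 512-dim GQA key/value projections) that is exactly 5,636,096:

# Sanity check of the PEFT summary line; dimensions from Llama-3.2-1B's config.
r, num_layers = 8, 16
shapes = {                       # (d_in, d_out) per adapted projection
    "q_proj": (2048, 2048), "k_proj": (2048, 512), "v_proj": (2048, 512),
    "o_proj": (2048, 2048), "gate_proj": (2048, 8192), "up_proj": (2048, 8192),
    "down_proj": (8192, 2048),
}
trainable = num_layers * sum(r * (i + o) for i, o in shapes.values())
print(trainable)                           # 5636096
print(100 * trainable / 1_241_450_496)     # ~0.4540, the reported trainable%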
Resolving data files: 100%|██████████████████████████████| 103/103 [00:00<00:00, 258.99it/s]
max_steps is given, it will override any value given in num_train_epochs
  0%|                              | 0/400 [00:00<?, ?it/s]
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
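This dtype warning is expected for fp16 QLoRA runs: the usual peft recipe upcasts layer norms (and sometimes embeddings) to float32 for numerical stability, then casts activations back to float16 at the adapter boundary. A sketch of the preparation step that is presumably responsible (assumed; it is not shown in this log):

from transformers import AutoModelForCausalLM
from peft import get_peft_model, prepare_model_for_kbit_training

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    quantization_config=bnb_config,             # bnb_config from the earlier sketch
)
model = prepare_model_for_kbit_training(model)  # upcasts norm layers to float32
model = get_peft_model(model, lora_config)      # lora_config from the earlier sketch
model.print_trainable_parameters()              # prints the "trainable params" line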
{'loss': 1.8204, 'grad_norm': 1.3328591585159302, 'learning_rate': 4.99229333433282e-05, 'epoch': 0.03}
{'loss': 1.2134, 'grad_norm': 1.060073733329773, 'learning_rate': 4.9692208514878444e-05, 'epoch': 0.05}
{'loss': 1.0953, 'grad_norm': 1.7300605773925781, 'learning_rate': 4.9309248009941914e-05, 'epoch': 0.07}
{'loss': 1.0424, 'grad_norm': 2.2006287574768066, 'learning_rate': 4.877641290737884e-05, 'epoch': 0.1}
{'loss': 1.4761, 'grad_norm': 2.6896536350250244, 'learning_rate': 4.8096988312782174e-05, 'epoch': 0.12}
{'loss': 1.9322, 'grad_norm': 1.9243583679199219, 'learning_rate': 4.72751631047092e-05, 'epoch': 0.15}
{'loss': 1.484, 'grad_norm': 2.2710037231445312, 'learning_rate': 4.6316004108852305e-05, 'epoch': 0.17}
{'loss': 1.047, 'grad_norm': 2.4178426265716553, 'learning_rate': 4.522542485937369e-05, 'epoch': 0.2}
 20%|██████                        | 80/400 [01:09<04:34, 1.17it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (199878 > 131072). Running this sequence through the model will result in indexing errors
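This tokenizer warning is benign as long as long documents are chunked before they reach the model: one raw source file tokenizes to more tokens (199,878) than Llama's 131,072-token context, but training only ever sees TRAIN_SEQ_LEN-sized windows. A hypothetical illustration of that pattern:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
long_text = open("some_long_source_file.py").read()              # placeholder path
ids = tokenizer(long_text, truncation=False)["input_ids"]        # may warn; harmless here
windows = [ids[i : i + 1024] for i in range(0, len(ids), 1024)]  # TRAIN_SEQ_LEN chunks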
{'eval_loss': 1.3107608556747437, 'eval_runtime': 152.7934, 'eval_samples_per_second': 3.357, 'eval_steps_per_second': 3.357, 'epoch': 0.2}
{'loss': 1.7164, 'grad_norm': 1.8474045991897583, 'learning_rate': 4.401014914000078e-05, 'epoch': 0.23}
{'loss': 1.8529, 'grad_norm': 2.2887606620788574, 'learning_rate': 4.267766952966369e-05, 'epoch': 0.25}
{'loss': 1.1273, 'grad_norm': 2.1084353923797607, 'learning_rate': 4.123620120825459e-05, 'epoch': 0.28}
{'loss': 0.6202, 'grad_norm': 1.7139441967010498, 'learning_rate': 3.969463130731183e-05, 'epoch': 0.3}
{'loss': 0.6318, 'grad_norm': 2.10313081741333, 'learning_rate': 3.8062464117898724e-05, 'epoch': 0.33}
{'loss': 1.7454, 'grad_norm': 2.7863190174102783, 'learning_rate': 3.634976249348867e-05, 'epoch': 0.35}
{'loss': 1.2459, 'grad_norm': 3.6123578548431396, 'learning_rate': 3.456708580912725e-05, 'epoch': 0.38}
{'loss': 1.2299, 'grad_norm': 2.599963903427124, 'learning_rate': 3.272542485937369e-05, 'epoch': 0.4}
{'eval_loss': 1.1106401681900024, 'eval_runtime': 151.9897, 'eval_samples_per_second': 3.375, 'eval_steps_per_second': 3.375, 'epoch': 0.4}
{'loss': 0.4903, 'grad_norm': 2.36421275138855, 'learning_rate': 3.083613409639764e-05, 'epoch': 0.42}
{'loss': 1.1203, 'grad_norm': 3.1779987812042236, 'learning_rate': 2.8910861626005776e-05, 'epoch': 0.45}
{'loss': 1.1351, 'grad_norm': 2.7350029945373535, 'learning_rate': 2.6961477393196126e-05, 'epoch': 0.47}
{'loss': 1.2146, 'grad_norm': 2.880659341812134, 'learning_rate': 2.5e-05, 'epoch': 0.5}
{'loss': 1.0687, 'grad_norm': 2.37646222114563, 'learning_rate': 2.303852260680388e-05, 'epoch': 0.53}
{'loss': 1.1508, 'grad_norm': 2.899869203567505, 'learning_rate': 2.1283189152576925e-05, 'epoch': 0.55}
{'loss': 0.8881, 'grad_norm': 2.137705087661743, 'learning_rate': 1.935496832827241e-05, 'epoch': 0.57}
{'loss': 0.3349, 'grad_norm': 1.7815237045288086, 'learning_rate': 1.7648991869192405e-05, 'epoch': 0.6}
{'eval_loss': 0.9981011152267456, 'eval_runtime': 152.1964, 'eval_samples_per_second': 3.371, 'eval_steps_per_second': 3.371, 'epoch': 0.6}
{'loss': 1.0572, 'grad_norm': 2.96177339553833, 'learning_rate': 1.5796886182883053e-05, 'epoch': 0.62}
{'loss': 1.0081, 'grad_norm': 2.4514148235321045, 'learning_rate': 1.4001520753602121e-05, 'epoch': 0.65}
{'loss': 1.2058, 'grad_norm': 2.4870169162750244, 'learning_rate': 1.2273964606240718e-05, 'epoch': 0.68}
{'loss': 0.7643, 'grad_norm': 2.348313331604004, 'learning_rate': 1.0624868698918045e-05, 'epoch': 0.7}
{'loss': 1.0911, 'grad_norm': 3.0336053371429443, 'learning_rate': 9.064400256282757e-06, 'epoch': 0.72}
{'loss': 1.0361, 'grad_norm': 2.483652114868164, 'learning_rate': 7.602180085192143e-06, 'epoch': 0.75}
{'loss': 1.4973, 'grad_norm': 2.5220589637756348, 'learning_rate': 6.247223259238511e-06, 'epoch': 0.78}
{'loss': 1.166, 'grad_norm': 1.9077987670898438, 'learning_rate': 5.007883537822736e-06, 'epoch': 0.8}
{'eval_loss': 0.9547628164291382, 'eval_runtime': 153.3023, 'eval_samples_per_second': 3.346, 'eval_steps_per_second': 3.346, 'epoch': 0.8}
{'loss': 0.2757, 'grad_norm': 2.0077965259552, 'learning_rate': 3.891801862449629e-06, 'epoch': 0.82}
{'loss': 0.3701, 'grad_norm': 1.6974297761917114, 'learning_rate': 2.9058592477826636e-06, 'epoch': 0.85}
{'loss': 0.3, 'grad_norm': 1.4839575290679932, 'learning_rate': 2.0561343579004715e-06, 'epoch': 0.88}
{'loss': 0.3358, 'grad_norm': 2.096834182739258, 'learning_rate': 1.3478660293113676e-06, 'epoch': 0.9}
{'loss': 1.0767, 'grad_norm': 4.288107872009277, 'learning_rate': 7.854209717842231e-07, 'epoch': 0.93}
{'loss': 1.1035, 'grad_norm': 3.2066102027893066, 'learning_rate': 3.7226684613065333e-07, 'epoch': 0.95}
{'loss': 0.7382, 'grad_norm': 1.637189269065857, 'learning_rate': 1.109508849230001e-07, 'epoch': 0.97}
{'loss': 0.888, 'grad_norm': 3.4519171714782715, 'learning_rate': 3.0841879584853073e-09, 'epoch': 1.0}
{'eval_loss': 0.954106867313385, 'eval_runtime': 153.0076, 'eval_samples_per_second': 3.353, 'eval_steps_per_second': 3.353, 'epoch': 1.0}
{'train_runtime': 1109.9308, 'train_samples_per_second': 0.36, 'train_steps_per_second': 0.36, 'train_tokens_per_second': 369.032, 'train_loss': 1.0649387103319168, 'epoch': 1.0}
100%|██████████████████████████████| 400/400 [18:29<00:00, 2.77s/it]
--- total: 1128.085711479187 s
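The trainer summary is internally consistent: at batch size 1 with no gradient accumulation, steps/s equals samples/s, and 400 steps of 1024-token sequences account for the token rate (rough check below, assuming every packed sequence is exactly TRAIN_SEQ_LEN tokens). The remaining ~18 s of the 1128 s total is presumably setup outside trainer.train().

train_runtime = 1109.9308                        # s, from the summary line above
steps, seq_len, bsize = 400, 1024, 1
print(steps / train_runtime)                     # ~0.360 steps/s (= samples/s at bsize 1)
print(steps * bsize * seq_len / train_runtime)   # ~369.0 tokens/s, as reported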
--- evaluation ---
eval_problem_0
eval_problem_1
eval_problem_2
eval_problem_3
eval_problem_4
eval_problem_5
eval_problem_6
eval_problem_7
eval_problem_8
eval_problem_9
eval_problem_10
eval_problem_11
eval_problem_12
eval_problem_13
eval_problem_14
eval_problem_15
eval_problem_16
eval_problem_17
eval_problem_18
eval_problem_19
eval_problem_20
eval_problem_21
eval_problem_22
eval_problem_23
eval_problem_24
eval_problem_25
eval_problem_26
eval_problem_27
eval_problem_28
eval_problem_29
eval_problem_30
eval_problem_31
eval_problem_32
eval_problem_33
eval_problem_34
eval_problem_35
eval_problem_36
eval_problem_37
eval_problem_38
eval_problem_39
eval_problem_40
eval_problem_41
eval_problem_42
eval_problem_43
eval_problem_44
eval_problem_45
eval_problem_46
eval_problem_47
eval_problem_48
eval_problem_49
--- total: 3349.682242155075 s
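With 15 samples for each of 50 task IDs, that is 750 generations in ~3350 s, about 4.5 s per sample. The natural way to score such a run is the unbiased pass@k estimator from the HumanEval paper; a minimal sketch, assuming per-problem pass counts have already been collected from the test harness:

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k (Chen et al., 2021): 1 - C(n-c, k) / C(n, k)."""
    if n - c < k:
        return 1.0          # fewer than k failures, so any k-subset passes
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

num_correct = [3, 0, 15]    # hypothetical per-problem pass counts, for illustration
print(np.mean([pass_at_k(15, c, 1) for c in num_correct]))   # pass@1 over problems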