YAML Metadata Warning: empty or missing YAML metadata in repo card
Check out the documentation for more information.
train set
TRAIN_DATASET = "angie-chen55/python-github-code" TRAIN_BASE_MODEL = "meta-llama/Llama-3.2-1B"
TRAIN_MODEL_HUB_NAME = "steve329" TRAIN_TUNED_MODEL_NAME = "test_fine_llama3_qlora_all_7" TRAIN_PEFT_LAYERS = "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj" TRAIN_LORA_ALPHA = 16 TRAIN_LORA_DROPOUT = 0.05 TRAIN_LORA_R = 8
TRAIN_SAMPLES = 5000 TRAIN_VAL_SAMPLES = 250 TRAIN_SEQ_LEN = 1024 TRAIN_MAX_STEPS = 1200 TRAIN_VAL_STEPS = 80 TRAIN_SAVE_STEPS = 1000 TRAIN_LOG_STEPS = 10 TRAIN_TRAIN_BSIZE = 1 TRAIN_VAL_BSIZE = 1 TRAIN_LR = 1e-5 TRAIN_LR_TYPE = "cosine" TRAIN_WARMUP_STEPS = 0 TRAIN_FP16 = True TRAIN_BF16 = False TRAIN_ACCU_STEPS = 1 TRAIN_WEIGHT_DECAY = 0.01
test set
TEST_NUM_SAMPLES_PER_TASK = 15 TEST_NUM_TASK_IDS = 50 TEST_MAX_NEW_TOKENS = 512
TEST_TEMPERATURE = 0.2 TEST_TOP_K = 50 TEST_TOP_P = 0.95 TEST_DO_SAMPLE = True TEST_REPETITION_PENALTY = 1.0
--- tuning ---
trainable params: 5,636,096 || all params: 1,241,450,496 || trainable%: 0.4540
Resolving data files: 100%|███████████████████████████████████████████| 108/108 [00:00<00:00, 400.68it/s]
max_steps is given, it will override any value given in num_train_epochs
0%| | 0/1200 [00:00<?, ?it/s]The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
{'loss': 1.2925, 'grad_norm': 0.6555778384208679, 'learning_rate': 9.998286624877786e-06, 'epoch': 0.01}
{'loss': 1.5304, 'grad_norm': 0.8546385765075684, 'learning_rate': 9.993147673772869e-06, 'epoch': 0.02}
{'loss': 1.4818, 'grad_norm': 0.7580430507659912, 'learning_rate': 9.984586668665641e-06, 'epoch': 0.03}
{'loss': 1.4858, 'grad_norm': 0.8087210655212402, 'learning_rate': 9.972609476841368e-06, 'epoch': 0.03}
{'loss': 1.365, 'grad_norm': 0.668307363986969, 'learning_rate': 9.957224306869053e-06, 'epoch': 0.04}
{'loss': 1.5286, 'grad_norm': 0.8724182844161987, 'learning_rate': 9.938441702975689e-06, 'epoch': 0.05}
{'loss': 1.5563, 'grad_norm': 0.5869552493095398, 'learning_rate': 9.916274537819774e-06, 'epoch': 0.06}
{'loss': 1.2979, 'grad_norm': 0.8597727417945862, 'learning_rate': 9.890738003669029e-06, 'epoch': 0.07}
{'eval_loss': 1.3621528148651123, 'eval_runtime': 166.1509, 'eval_samples_per_second': 3.449, 'eval_steps_per_second': 3.449, 'epoch': 0.07}
{'loss': 1.7059, 'grad_norm': 0.9398770928382874, 'learning_rate': 9.861849601988384e-06, 'epoch': 0.07}
{'loss': 1.5394, 'grad_norm': 0.8774957060813904, 'learning_rate': 9.829629131445342e-06, 'epoch': 0.08}
{'loss': 1.5158, 'grad_norm': 0.8455575704574585, 'learning_rate': 9.794098674340966e-06, 'epoch': 0.09}
{'loss': 1.0623, 'grad_norm': 0.7278249263763428, 'learning_rate': 9.755282581475769e-06, 'epoch': 0.1}
{'loss': 1.5046, 'grad_norm': 1.7426875829696655, 'learning_rate': 9.713207455460893e-06, 'epoch': 0.11}
{'loss': 1.2302, 'grad_norm': 0.8249136805534363, 'learning_rate': 9.667902132486009e-06, 'epoch': 0.12}
{'loss': 1.6462, 'grad_norm': 1.044198989868164, 'learning_rate': 9.619397662556434e-06, 'epoch': 0.12}
{'loss': 1.7248, 'grad_norm': 0.9166670441627502, 'learning_rate': 9.567727288213005e-06, 'epoch': 0.13}
{'eval_loss': 1.3530688285827637, 'eval_runtime': 166.8718, 'eval_samples_per_second': 3.434, 'eval_steps_per_second': 3.434, 'epoch': 0.13}
{'loss': 1.5593, 'grad_norm': 1.1306487321853638, 'learning_rate': 9.512926421749305e-06, 'epoch': 0.14}
{'loss': 1.3793, 'grad_norm': 1.521423101425171, 'learning_rate': 9.45503262094184e-06, 'epoch': 0.15}
{'loss': 1.3946, 'grad_norm': 0.8901950716972351, 'learning_rate': 9.394085563309827e-06, 'epoch': 0.16}
{'loss': 1.1316, 'grad_norm': 0.9611344933509827, 'learning_rate': 9.330127018922195e-06, 'epoch': 0.17}
{'loss': 1.4999, 'grad_norm': 0.9895099401473999, 'learning_rate': 9.263200821770462e-06, 'epoch': 0.17}
{'loss': 1.3442, 'grad_norm': 0.9981682896614075, 'learning_rate': 9.193352839727122e-06, 'epoch': 0.18}
{'loss': 1.3114, 'grad_norm': 1.0121331214904785, 'learning_rate': 9.120630943110078e-06, 'epoch': 0.19}
{'loss': 1.3136, 'grad_norm': 1.0117032527923584, 'learning_rate': 9.045084971874738e-06, 'epoch': 0.2}
{'eval_loss': 1.346709966659546, 'eval_runtime': 166.4293, 'eval_samples_per_second': 3.443, 'eval_steps_per_second': 3.443, 'epoch': 0.2}
{'loss': 1.3559, 'grad_norm': 2.3755764961242676, 'learning_rate': 8.966766701456177e-06, 'epoch': 0.21}
{'loss': 1.209, 'grad_norm': 0.98636394739151, 'learning_rate': 8.885729807284855e-06, 'epoch': 0.22}
{'loss': 1.3446, 'grad_norm': 1.3307329416275024, 'learning_rate': 8.802029828000157e-06, 'epoch': 0.23}
{'loss': 1.2605, 'grad_norm': 0.8003533482551575, 'learning_rate': 8.715724127386971e-06, 'epoch': 0.23}
{'loss': 1.2209, 'grad_norm': 0.99559485912323, 'learning_rate': 8.626871855061438e-06, 'epoch': 0.24}
{'loss': 1.3953, 'grad_norm': 1.1494324207305908, 'learning_rate': 8.535533905932739e-06, 'epoch': 0.25}
{'loss': 1.3805, 'grad_norm': 1.0891047716140747, 'learning_rate': 8.44177287846877e-06, 'epoch': 0.26}
{'loss': 1.48, 'grad_norm': 1.145277976989746, 'learning_rate': 8.345653031794292e-06, 'epoch': 0.27}
{'eval_loss': 1.3417248725891113, 'eval_runtime': 166.8068, 'eval_samples_per_second': 3.435, 'eval_steps_per_second': 3.435, 'epoch': 0.27}
{'loss': 1.398, 'grad_norm': 1.0213351249694824, 'learning_rate': 8.247240241650918e-06, 'epoch': 0.28}
{'loss': 1.5404, 'grad_norm': 1.088975429534912, 'learning_rate': 8.146601955249187e-06, 'epoch': 0.28}
{'loss': 1.2308, 'grad_norm': 1.4131255149841309, 'learning_rate': 8.043807145043604e-06, 'epoch': 0.29}
{'loss': 1.4368, 'grad_norm': 1.3954739570617676, 'learning_rate': 7.938926261462366e-06, 'epoch': 0.3}
{'loss': 1.3801, 'grad_norm': 1.317599892616272, 'learning_rate': 7.832031184624165e-06, 'epoch': 0.31}
{'loss': 1.4205, 'grad_norm': 1.0833520889282227, 'learning_rate': 7.723195175075136e-06, 'epoch': 0.32}
{'loss': 1.3659, 'grad_norm': 1.118585467338562, 'learning_rate': 7.612492823579744e-06, 'epoch': 0.33}
{'loss': 1.5178, 'grad_norm': 1.1180721521377563, 'learning_rate': 7.500000000000001e-06, 'epoch': 0.33}
{'eval_loss': 1.3384201526641846, 'eval_runtime': 167.0725, 'eval_samples_per_second': 3.43, 'eval_steps_per_second': 3.43, 'epoch': 0.33}
{'loss': 1.5222, 'grad_norm': 0.9835115671157837, 'learning_rate': 7.3857938012980425e-06, 'epoch': 0.34}
{'loss': 1.349, 'grad_norm': 0.8385874032974243, 'learning_rate': 7.269952498697734e-06, 'epoch': 0.35}
{'loss': 1.2585, 'grad_norm': 0.7762739658355713, 'learning_rate': 7.1525554840414765e-06, 'epoch': 0.36}
{'loss': 0.666, 'grad_norm': 0.9964519739151001, 'learning_rate': 7.033683215379002e-06, 'epoch': 0.37}
37%|████████████████████████                  | 445/1200 [20:14<10:46, 1.17it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (272292 > 131072). Running this sequence through the model will result in indexing errors
{'loss': 1.1939, 'grad_norm': 0.9662719964981079, 'learning_rate': 6.913417161825449e-06, 'epoch': 0.38}
{'loss': 1.7674, 'grad_norm': 0.928888201713562, 'learning_rate': 6.7918397477265e-06, 'epoch': 0.38}
{'loss': 1.6953, 'grad_norm': 0.9261885285377502, 'learning_rate': 6.669034296168855e-06, 'epoch': 0.39}
{'loss': 1.6957, 'grad_norm': 0.896446943283081, 'learning_rate': 6.545084971874738e-06, 'epoch': 0.4}
{'eval_loss': 1.3380820751190186, 'eval_runtime': 166.5377, 'eval_samples_per_second': 3.441, 'eval_steps_per_second': 3.441, 'epoch': 0.4}
{'loss': 1.7077, 'grad_norm': 0.8276058435440063, 'learning_rate': 6.420076723519615e-06, 'epoch': 0.41}
{'loss': 1.6604, 'grad_norm': 1.1022154092788696, 'learning_rate': 6.294095225512604e-06, 'epoch': 0.42}
{'loss': 1.6787, 'grad_norm': 0.9744248390197754, 'learning_rate': 6.1672268192795285e-06, 'epoch': 0.42}
{'loss': 1.6387, 'grad_norm': 0.9200438261032104, 'learning_rate': 6.039558454088796e-06, 'epoch': 0.43}
{'loss': 1.6655, 'grad_norm': 0.8796314001083374, 'learning_rate': 5.911177627460739e-06, 'epoch': 0.44}
{'loss': 1.6385, 'grad_norm': 1.1818307638168335, 'learning_rate': 5.782172325201155e-06, 'epoch': 0.45}
{'loss': 1.7233, 'grad_norm': 1.2186706066131592, 'learning_rate': 5.65263096110026e-06, 'epoch': 0.46}
{'loss': 1.7375, 'grad_norm': 1.024654507637024, 'learning_rate': 5.522642316338268e-06, 'epoch': 0.47}
{'eval_loss': 1.3414212465286255, 'eval_runtime': 166.4493, 'eval_samples_per_second': 3.442, 'eval_steps_per_second': 3.442, 'epoch': 0.47}
{'loss': 1.7259, 'grad_norm': 1.0963331460952759, 'learning_rate': 5.392295478639226e-06, 'epoch': 0.47}
{'loss': 1.7211, 'grad_norm': 1.0060495138168335, 'learning_rate': 5.2616797812147205e-06, 'epoch': 0.48}
{'loss': 1.6977, 'grad_norm': 0.9834098219871521, 'learning_rate': 5.130884741539367e-06, 'epoch': 0.49}
{'loss': 1.7199, 'grad_norm': 1.2639120817184448, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 1.6415, 'grad_norm': 1.2595736980438232, 'learning_rate': 4.869115258460636e-06, 'epoch': 0.51}
{'loss': 1.6914, 'grad_norm': 1.2806637287139893, 'learning_rate': 4.738320218785281e-06, 'epoch': 0.52}
{'loss': 1.6718, 'grad_norm': 1.2325680255889893, 'learning_rate': 4.6077045213607765e-06, 'epoch': 0.53}
{'loss': 1.6827, 'grad_norm': 1.1080881357192993, 'learning_rate': 4.477357683661734e-06, 'epoch': 0.53}
{'eval_loss': 1.3459768295288086, 'eval_runtime': 166.5262, 'eval_samples_per_second': 3.441, 'eval_steps_per_second': 3.441, 'epoch': 0.53}
{'loss': 1.6919, 'grad_norm': 1.2045620679855347, 'learning_rate': 4.347369038899744e-06, 'epoch': 0.54}
{'loss': 1.6638, 'grad_norm': 1.0583103895187378, 'learning_rate': 4.230759150636049e-06, 'epoch': 0.55}
{'loss': 1.6103, 'grad_norm': 1.3172162771224976, 'learning_rate': 4.1016962570340375e-06, 'epoch': 0.56}
{'loss': 1.6251, 'grad_norm': 1.3447258472442627, 'learning_rate': 3.973249015946182e-06, 'epoch': 0.57}
{'loss': 1.6282, 'grad_norm': 1.2206984758377075, 'learning_rate': 3.845505458695438e-06, 'epoch': 0.57}
{'loss': 1.5871, 'grad_norm': 1.6077394485473633, 'learning_rate': 3.7185531343350167e-06, 'epoch': 0.58}
{'loss': 1.5839, 'grad_norm': 1.105552315711975, 'learning_rate': 3.5924790496466233e-06, 'epoch': 0.59}
{'loss': 1.6895, 'grad_norm': 1.6049262285232544, 'learning_rate': 3.4673696095103626e-06, 'epoch': 0.6}
{'eval_loss': 1.349403977394104, 'eval_runtime': 166.2095, 'eval_samples_per_second': 3.447, 'eval_steps_per_second': 3.447, 'epoch': 0.6}
{'loss': 1.7677, 'grad_norm': 1.2751911878585815, 'learning_rate': 3.3433105576871448e-06, 'epoch': 0.61}
{'loss': 1.6491, 'grad_norm': 1.1249381303787231, 'learning_rate': 3.220386918054206e-06, 'epoch': 0.62}
{'loss': 1.6353, 'grad_norm': 1.5168108940124512, 'learning_rate': 3.0986829363339766e-06, 'epoch': 0.62}
{'loss': 1.3929, 'grad_norm': 1.313550591468811, 'learning_rate': 2.9782820223562758e-06, 'epoch': 0.63}
{'loss': 1.3623, 'grad_norm': 2.6660618782043457, 'learning_rate': 2.859266692893386e-06, 'epoch': 0.64}
{'loss': 2.0463, 'grad_norm': 1.5598856210708618, 'learning_rate': 2.741718515107172e-06, 'epoch': 0.65}
{'loss': 1.0672, 'grad_norm': 1.8408468961715698, 'learning_rate': 2.6257180506470283e-06, 'epoch': 0.66}
{'loss': 1.3028, 'grad_norm': 1.6172791719436646, 'learning_rate': 2.5113448004369397e-06, 'epoch': 0.67}
{'eval_loss': 1.3427060842514038, 'eval_runtime': 167.057, 'eval_samples_per_second': 3.43, 'eval_steps_per_second': 3.43, 'epoch': 0.67}
{'loss': 1.4291, 'grad_norm': 1.5286149978637695, 'learning_rate': 2.39867715018953e-06, 'epoch': 0.68}
{'loss': 1.4035, 'grad_norm': 0.8979455232620239, 'learning_rate': 2.2877923166844073e-06, 'epoch': 0.68}
{'loss': 1.3205, 'grad_norm': 1.1117092370986938, 'learning_rate': 2.1787662948476302e-06, 'epoch': 0.69}
{'loss': 0.9564, 'grad_norm': 0.9609827399253845, 'learning_rate': 2.071673805668597e-06, 'epoch': 0.7}
{'loss': 0.9553, 'grad_norm': 0.9876940250396729, 'learning_rate': 1.9665882449900024e-06, 'epoch': 0.71}
{'loss': 1.3581, 'grad_norm': 1.452584147453308, 'learning_rate': 1.8635816332059925e-06, 'epoch': 0.72}
{'loss': 1.7223, 'grad_norm': 1.4269262552261353, 'learning_rate': 1.7627245659029913e-06, 'epoch': 0.72}
{'loss': 1.6229, 'grad_norm': 1.5960813760757446, 'learning_rate': 1.6640861654770007e-06, 'epoch': 0.73}
{'eval_loss': 1.3407820463180542, 'eval_runtime': 166.3153, 'eval_samples_per_second': 3.445, 'eval_steps_per_second': 3.445, 'epoch': 0.73}
{'loss': 1.5611, 'grad_norm': 1.3899459838867188, 'learning_rate': 1.5677340337605817e-06, 'epoch': 0.74}
{'loss': 1.2995, 'grad_norm': 1.4998365640640259, 'learning_rate': 1.473734205691913e-06, 'epoch': 0.75}
{'loss': 1.3506, 'grad_norm': 2.1876413822174072, 'learning_rate': 1.382151104057754e-06, 'epoch': 0.76}
{'loss': 1.4488, 'grad_norm': 1.4815597534179688, 'learning_rate': 1.2930474953412897e-06, 'epoch': 0.77}
{'loss': 1.5299, 'grad_norm': 1.3930795192718506, 'learning_rate': 1.206484446705109e-06, 'epoch': 0.78}
{'loss': 1.4423, 'grad_norm': 1.2058953046798706, 'learning_rate': 1.1225212841388282e-06, 'epoch': 0.78}
{'loss': 1.4201, 'grad_norm': 1.611682415008545, 'learning_rate': 1.0412155518000138e-06, 'epoch': 0.79}
{'loss': 1.2739, 'grad_norm': 1.3232898712158203, 'learning_rate': 9.626229725763003e-07, 'epoch': 0.8}
{'eval_loss': 1.3388594388961792, 'eval_runtime': 165.7108, 'eval_samples_per_second': 3.458, 'eval_steps_per_second': 3.458, 'epoch': 0.8}
{'loss': 1.8433, 'grad_norm': 1.4228135347366333, 'learning_rate': 8.867974098957016e-07, 'epoch': 0.81}
{'loss': 1.9155, 'grad_norm': 1.4317476749420166, 'learning_rate': 8.137908308113058e-07, 'epoch': 0.82}
{'loss': 1.8484, 'grad_norm': 1.9369254112243652, 'learning_rate': 7.436532703856575e-07, 'epoch': 0.82}
{'loss': 1.0917, 'grad_norm': 1.8946501016616821, 'learning_rate': 6.764327973992252e-07, 'epoch': 0.83}
{'loss': 0.9249, 'grad_norm': 1.78310227394104, 'learning_rate': 6.12175481406454e-07, 'epoch': 0.84}
{'loss': 0.9626, 'grad_norm': 2.0840466022491455, 'learning_rate': 5.509253611620019e-07, 'epoch': 0.85}
{'loss': 0.9619, 'grad_norm': 2.018683433532715, 'learning_rate': 4.92724414438771e-07, 'epoch': 0.86}
{'loss': 0.9487, 'grad_norm': 1.3101752996444702, 'learning_rate': 4.3761252925844656e-07, 'epoch': 0.87}
{'eval_loss': 1.3384125232696533, 'eval_runtime': 166.5312, 'eval_samples_per_second': 3.441, 'eval_steps_per_second': 3.441, 'epoch': 0.87}
{'loss': 2.1813, 'grad_norm': 2.0008652210235596, 'learning_rate': 3.8562747655422895e-07, 'epoch': 0.88}
{'loss': 2.5343, 'grad_norm': 1.6654351949691772, 'learning_rate': 3.3680488428453005e-07, 'epoch': 0.88}
{'loss': 1.4823, 'grad_norm': 1.8480143547058105, 'learning_rate': 2.911782130153484e-07, 'epoch': 0.89}
{'loss': 0.9216, 'grad_norm': 1.6502251625061035, 'learning_rate': 2.4877873298807033e-07, 'epoch': 0.9}
{'loss': 1.0498, 'grad_norm': 1.3823391199111938, 'learning_rate': 2.096355026884045e-07, 'epoch': 0.91}
{'loss': 1.0802, 'grad_norm': 1.466383934020996, 'learning_rate': 1.7377534893115001e-07, 'epoch': 0.92}
{'loss': 1.0971, 'grad_norm': 1.5681973695755005, 'learning_rate': 1.4122284847443713e-07, 'epoch': 0.93}
{'loss': 1.0772, 'grad_norm': 1.4816951751708984, 'learning_rate': 1.1200031117604704e-07, 'epoch': 0.93}
{'eval_loss': 1.338365077972412, 'eval_runtime': 166.1598, 'eval_samples_per_second': 3.448, 'eval_steps_per_second': 3.448, 'epoch': 0.93}
{'loss': 1.0856, 'grad_norm': 1.3942521810531616, 'learning_rate': 8.612776470334316e-08, 'epoch': 0.94}
{'loss': 1.0931, 'grad_norm': 1.62803053855896, 'learning_rate': 6.362294080731746e-08, 'epoch': 0.95}
{'loss': 1.0737, 'grad_norm': 1.4619779586791992, 'learning_rate': 4.450126317012637e-08, 'epoch': 0.96}
{'loss': 1.0926, 'grad_norm': 1.3680267333984375, 'learning_rate': 2.877583683447316e-08, 'epoch': 0.97}
{'loss': 1.0868, 'grad_norm': 1.6456063985824585, 'learning_rate': 1.6457439222065662e-08, 'epoch': 0.97}
{'loss': 1.0446, 'grad_norm': 1.5694900751113892, 'learning_rate': 7.55451274731034e-09, 'epoch': 0.98}
{'loss': 1.0509, 'grad_norm': 1.4506360292434692, 'learning_rate': 2.073159031301675e-09, 'epoch': 0.99}
{'loss': 1.0545, 'grad_norm': 1.3517605066299438, 'learning_rate': 1.7134720076139588e-11, 'epoch': 1.0}
{'eval_loss': 1.3383928537368774, 'eval_runtime': 165.9421, 'eval_samples_per_second': 3.453, 'eval_steps_per_second': 3.453, 'epoch': 1.0}
{'train_runtime': 3524.9921, 'train_samples_per_second': 0.34, 'train_steps_per_second': 0.34, 'train_tokens_per_second': 348.597, 'train_loss': 1.4365745460987092, 'epoch': 1.0}
100%|████████████████████████████████████████████████████████████████| 1200/1200 [58:44<00:00, 2.94s/it]
--- total: 3542.8606696128845 s
--- evaluation --- eval_problem_0 eval_problem_1 eval_problem_2 eval_problem_3 eval_problem_4 eval_problem_5 eval_problem_6 eval_problem_7 eval_problem_8 eval_problem_9 eval_problem_10 eval_problem_11 eval_problem_12 eval_problem_13 eval_problem_14 eval_problem_15 eval_problem_16 eval_problem_17 eval_problem_18 eval_problem_19 eval_problem_20 eval_problem_21 eval_problem_22 eval_problem_23 eval_problem_24 eval_problem_25 eval_problem_26 eval_problem_27 eval_problem_28 eval_problem_29 eval_problem_30 eval_problem_31 eval_problem_32 eval_problem_33 eval_problem_34 eval_problem_35 eval_problem_36 eval_problem_37 eval_problem_38 eval_problem_39 eval_problem_40 eval_problem_41 eval_problem_42 eval_problem_43 eval_problem_44 eval_problem_45 eval_problem_46 eval_problem_47 eval_problem_48 eval_problem_49 --- total: 2808.219149827957 s