YAML Metadata Warning: empty or missing yaml metadata in repo card (https://huggingface.co/docs/hub/model-cards#model-card-metadata)

Training configuration

TRAIN_DATASET = "sayakpaul/hf-codegen-v2" TRAIN_BASE_MODEL = "meta-llama/Llama-3.2-1B"

TRAIN_MODEL_HUB_NAME = "steve329" TRAIN_TUNED_MODEL_NAME = "test_fine_llama3_qlora_all_3" TRAIN_PEFT_LAYERS = "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj" TRAIN_LORA_ALPHA = 16 TRAIN_LORA_DROPOUT = 0.05 TRAIN_LORA_R = 8

TRAIN_SAMPLES = 1000 TRAIN_VAL_SAMPLES = 10 # about 30 steps per val sample TRAIN_SEQ_LEN = 1024 TRAIN_MAX_STEPS = 400 TRAIN_VAL_STEPS = 80 TRAIN_SAVE_STEPS = 1000 TRAIN_LOG_STEPS = 10 TRAIN_TRAIN_BSIZE = 1 TRAIN_VAL_BSIZE = 1 TRAIN_LR = 5e-5 TRAIN_LR_TYPE = "cosine" TRAIN_WARMUP_STEPS = 0 TRAIN_FP16 = True TRAIN_BF16 = False TRAIN_ACCU_STEPS = 1 TRAIN_WEIGHT_DECAY = 0.01

Test (evaluation) configuration

TEST_NUM_SAMPLES_PER_TASK = 15 TEST_NUM_TASK_IDS = 50 TEST_MAX_NEW_TOKENS = 512

TEST_TEMPERATURE = 0.2 TEST_TOP_K = 50 TEST_TOP_P = 0.95 TEST_DO_SAMPLE = True TEST_REPETITION_PENALTY = 1.0

--- tuning --- trainable params: 5,636,096 || all params: 1,241,450,496 || trainable%: 0.4540 Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 103/103 [00:00<00:00, 258.99it/s] max_steps is given, it will override any value given in num_train_epochs 0%| | 0/400 [00:00<?, ?it/s]The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16. {'loss': 1.8204, 'grad_norm': 1.3328591585159302, 'learning_rate': 4.99229333433282e-05, 'epoch': 0.03}
{'loss': 1.2134, 'grad_norm': 1.060073733329773, 'learning_rate': 4.9692208514878444e-05, 'epoch': 0.05}
{'loss': 1.0953, 'grad_norm': 1.7300605773925781, 'learning_rate': 4.9309248009941914e-05, 'epoch': 0.07}
{'loss': 1.0424, 'grad_norm': 2.2006287574768066, 'learning_rate': 4.877641290737884e-05, 'epoch': 0.1}
{'loss': 1.4761, 'grad_norm': 2.6896536350250244, 'learning_rate': 4.8096988312782174e-05, 'epoch': 0.12}
{'loss': 1.9322, 'grad_norm': 1.9243583679199219, 'learning_rate': 4.72751631047092e-05, 'epoch': 0.15}
{'loss': 1.484, 'grad_norm': 2.2710037231445312, 'learning_rate': 4.6316004108852305e-05, 'epoch': 0.17}
{'loss': 1.047, 'grad_norm': 2.4178426265716553, 'learning_rate': 4.522542485937369e-05, 'epoch': 0.2}
20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 80/400 [01:09<04:34, 1.17it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (199878 > 131072). Running this sequence through the model will result in indexing errors {'eval_loss': 1.3107608556747437, 'eval_runtime': 152.7934, 'eval_samples_per_second': 3.357, 'eval_steps_per_second': 3.357, 'epoch': 0.2} {'loss': 1.7164, 'grad_norm': 1.8474045991897583, 'learning_rate': 4.401014914000078e-05, 'epoch': 0.23}
{'loss': 1.8529, 'grad_norm': 2.2887606620788574, 'learning_rate': 4.267766952966369e-05, 'epoch': 0.25}
{'loss': 1.1273, 'grad_norm': 2.1084353923797607, 'learning_rate': 4.123620120825459e-05, 'epoch': 0.28}
{'loss': 0.6202, 'grad_norm': 1.7139441967010498, 'learning_rate': 3.969463130731183e-05, 'epoch': 0.3}
{'loss': 0.6318, 'grad_norm': 2.10313081741333, 'learning_rate': 3.8062464117898724e-05, 'epoch': 0.33}
{'loss': 1.7454, 'grad_norm': 2.7863190174102783, 'learning_rate': 3.634976249348867e-05, 'epoch': 0.35}
{'loss': 1.2459, 'grad_norm': 3.6123578548431396, 'learning_rate': 3.456708580912725e-05, 'epoch': 0.38}
{'loss': 1.2299, 'grad_norm': 2.599963903427124, 'learning_rate': 3.272542485937369e-05, 'epoch': 0.4}
{'eval_loss': 1.1106401681900024, 'eval_runtime': 151.9897, 'eval_samples_per_second': 3.375, 'eval_steps_per_second': 3.375, 'epoch': 0.4} {'loss': 0.4903, 'grad_norm': 2.36421275138855, 'learning_rate': 3.083613409639764e-05, 'epoch': 0.42}
{'loss': 1.1203, 'grad_norm': 3.1779987812042236, 'learning_rate': 2.8910861626005776e-05, 'epoch': 0.45}
{'loss': 1.1351, 'grad_norm': 2.7350029945373535, 'learning_rate': 2.6961477393196126e-05, 'epoch': 0.47}
{'loss': 1.2146, 'grad_norm': 2.880659341812134, 'learning_rate': 2.5e-05, 'epoch': 0.5}
{'loss': 1.0687, 'grad_norm': 2.37646222114563, 'learning_rate': 2.303852260680388e-05, 'epoch': 0.53}
{'loss': 1.1508, 'grad_norm': 2.899869203567505, 'learning_rate': 2.1283189152576925e-05, 'epoch': 0.55}
{'loss': 0.8881, 'grad_norm': 2.137705087661743, 'learning_rate': 1.935496832827241e-05, 'epoch': 0.57}
{'loss': 0.3349, 'grad_norm': 1.7815237045288086, 'learning_rate': 1.7648991869192405e-05, 'epoch': 0.6}
{'eval_loss': 0.9981011152267456, 'eval_runtime': 152.1964, 'eval_samples_per_second': 3.371, 'eval_steps_per_second': 3.371, 'epoch': 0.6} {'loss': 1.0572, 'grad_norm': 2.96177339553833, 'learning_rate': 1.5796886182883053e-05, 'epoch': 0.62}
{'loss': 1.0081, 'grad_norm': 2.4514148235321045, 'learning_rate': 1.4001520753602121e-05, 'epoch': 0.65}
{'loss': 1.2058, 'grad_norm': 2.4870169162750244, 'learning_rate': 1.2273964606240718e-05, 'epoch': 0.68}
{'loss': 0.7643, 'grad_norm': 2.348313331604004, 'learning_rate': 1.0624868698918045e-05, 'epoch': 0.7}
{'loss': 1.0911, 'grad_norm': 3.0336053371429443, 'learning_rate': 9.064400256282757e-06, 'epoch': 0.72}
{'loss': 1.0361, 'grad_norm': 2.483652114868164, 'learning_rate': 7.602180085192143e-06, 'epoch': 0.75}
{'loss': 1.4973, 'grad_norm': 2.5220589637756348, 'learning_rate': 6.247223259238511e-06, 'epoch': 0.78}
{'loss': 1.166, 'grad_norm': 1.9077987670898438, 'learning_rate': 5.007883537822736e-06, 'epoch': 0.8}
{'eval_loss': 0.9547628164291382, 'eval_runtime': 153.3023, 'eval_samples_per_second': 3.346, 'eval_steps_per_second': 3.346, 'epoch': 0.8} {'loss': 0.2757, 'grad_norm': 2.0077965259552, 'learning_rate': 3.891801862449629e-06, 'epoch': 0.82}
{'loss': 0.3701, 'grad_norm': 1.6974297761917114, 'learning_rate': 2.9058592477826636e-06, 'epoch': 0.85}
{'loss': 0.3, 'grad_norm': 1.4839575290679932, 'learning_rate': 2.0561343579004715e-06, 'epoch': 0.88}
{'loss': 0.3358, 'grad_norm': 2.096834182739258, 'learning_rate': 1.3478660293113676e-06, 'epoch': 0.9}
{'loss': 1.0767, 'grad_norm': 4.288107872009277, 'learning_rate': 7.854209717842231e-07, 'epoch': 0.93}
{'loss': 1.1035, 'grad_norm': 3.2066102027893066, 'learning_rate': 3.7226684613065333e-07, 'epoch': 0.95}
{'loss': 0.7382, 'grad_norm': 1.637189269065857, 'learning_rate': 1.109508849230001e-07, 'epoch': 0.97}
{'loss': 0.888, 'grad_norm': 3.4519171714782715, 'learning_rate': 3.0841879584853073e-09, 'epoch': 1.0}
{'eval_loss': 0.954106867313385, 'eval_runtime': 153.0076, 'eval_samples_per_second': 3.353, 'eval_steps_per_second': 3.353, 'epoch': 1.0} {'train_runtime': 1109.9308, 'train_samples_per_second': 0.36, 'train_steps_per_second': 0.36, 'train_tokens_per_second': 369.032, 'train_loss': 1.0649387103319168, 'epoch': 1.0} 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 400/400 [18:29<00:00, 2.77s/it] --- total: 1128.085711479187 s --- evaluation --- eval_problem_0 eval_problem_1 eval_problem_2 eval_problem_3 eval_problem_4 eval_problem_5 eval_problem_6 eval_problem_7 eval_problem_8 eval_problem_9 eval_problem_10 eval_problem_11 eval_problem_12 eval_problem_13 eval_problem_14 eval_problem_15 eval_problem_16 eval_problem_17 eval_problem_18 eval_problem_19 eval_problem_20 eval_problem_21 eval_problem_22 eval_problem_23 eval_problem_24 eval_problem_25 eval_problem_26 eval_problem_27 eval_problem_28 eval_problem_29 eval_problem_30 eval_problem_31 eval_problem_32 eval_problem_33 eval_problem_34 eval_problem_35 eval_problem_36 eval_problem_37 eval_problem_38 eval_problem_39 eval_problem_40 eval_problem_41 eval_problem_42 eval_problem_43 eval_problem_44 eval_problem_45 eval_problem_46 eval_problem_47 eval_problem_48 eval_problem_49 --- total: 3349.682242155075 s

Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model isn't deployed by any Inference Provider. πŸ™‹ Ask for provider support