{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 10725, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002331002331002331, "grad_norm": 2.2309207419995793, "learning_rate": 4.655493482309125e-07, "loss": 0.868, "num_tokens": 1310720.0, "step": 5 }, { "epoch": 0.004662004662004662, "grad_norm": 2.1118896401046383, "learning_rate": 9.31098696461825e-07, "loss": 0.8785, "num_tokens": 2621440.0, "step": 10 }, { "epoch": 0.006993006993006993, "grad_norm": 1.5940427530885184, "learning_rate": 1.3966480446927373e-06, "loss": 0.8483, "num_tokens": 3932160.0, "step": 15 }, { "epoch": 0.009324009324009324, "grad_norm": 1.2314768273947896, "learning_rate": 1.86219739292365e-06, "loss": 0.8081, "num_tokens": 5242880.0, "step": 20 }, { "epoch": 0.011655011655011656, "grad_norm": 1.1773776679892092, "learning_rate": 2.3277467411545626e-06, "loss": 0.81, "num_tokens": 6553600.0, "step": 25 }, { "epoch": 0.013986013986013986, "grad_norm": 0.956055714801497, "learning_rate": 2.7932960893854746e-06, "loss": 0.8035, "num_tokens": 7864320.0, "step": 30 }, { "epoch": 0.016317016317016316, "grad_norm": 0.7062278575164957, "learning_rate": 3.2588454376163876e-06, "loss": 0.7718, "num_tokens": 9154672.0, "step": 35 }, { "epoch": 0.018648018648018648, "grad_norm": 0.772524803320663, "learning_rate": 3.7243947858473e-06, "loss": 0.7217, "num_tokens": 10465392.0, "step": 40 }, { "epoch": 0.02097902097902098, "grad_norm": 0.5893728153785672, "learning_rate": 4.189944134078212e-06, "loss": 0.7274, "num_tokens": 11776112.0, "step": 45 }, { "epoch": 0.023310023310023312, "grad_norm": 0.5065797141230348, "learning_rate": 4.655493482309125e-06, "loss": 0.7234, "num_tokens": 13086832.0, "step": 50 }, { "epoch": 0.02564102564102564, "grad_norm": 0.48651817735023045, "learning_rate": 5.121042830540038e-06, "loss": 0.7267, "num_tokens": 14397552.0, "step": 55 }, { "epoch": 0.027972027972027972, "grad_norm": 0.4552730080037496, "learning_rate": 5.586592178770949e-06, "loss": 0.6927, "num_tokens": 15708272.0, "step": 60 }, { "epoch": 0.030303030303030304, "grad_norm": 0.46005771739470563, "learning_rate": 6.052141527001862e-06, "loss": 0.6771, "num_tokens": 17018992.0, "step": 65 }, { "epoch": 0.03263403263403263, "grad_norm": 0.4299236849439654, "learning_rate": 6.517690875232775e-06, "loss": 0.6663, "num_tokens": 18329712.0, "step": 70 }, { "epoch": 0.03496503496503497, "grad_norm": 0.4516492360855498, "learning_rate": 6.983240223463687e-06, "loss": 0.6653, "num_tokens": 19640432.0, "step": 75 }, { "epoch": 0.037296037296037296, "grad_norm": 0.4149060424751554, "learning_rate": 7.4487895716946e-06, "loss": 0.6742, "num_tokens": 20951152.0, "step": 80 }, { "epoch": 0.039627039627039624, "grad_norm": 0.4609610250520845, "learning_rate": 7.914338919925513e-06, "loss": 0.6415, "num_tokens": 22261872.0, "step": 85 }, { "epoch": 0.04195804195804196, "grad_norm": 0.43308223843066856, "learning_rate": 8.379888268156424e-06, "loss": 0.6748, "num_tokens": 23572592.0, "step": 90 }, { "epoch": 0.04428904428904429, "grad_norm": 0.4477929543500944, "learning_rate": 8.845437616387337e-06, "loss": 0.6767, "num_tokens": 24883312.0, "step": 95 }, { "epoch": 0.046620046620046623, "grad_norm": 0.43548039599877864, "learning_rate": 9.31098696461825e-06, "loss": 0.6415, "num_tokens": 26194032.0, "step": 100 }, { "epoch": 0.04895104895104895, "grad_norm": 0.49046398552976145, "learning_rate": 9.776536312849161e-06, "loss": 0.6372, "num_tokens": 27504752.0, "step": 105 }, { "epoch": 0.05128205128205128, "grad_norm": 0.4895199721021838, "learning_rate": 1.0242085661080076e-05, "loss": 0.6416, "num_tokens": 28815472.0, "step": 110 }, { "epoch": 0.053613053613053616, "grad_norm": 0.46201645981993494, "learning_rate": 1.0707635009310987e-05, "loss": 0.6493, "num_tokens": 30126192.0, "step": 115 }, { "epoch": 0.055944055944055944, "grad_norm": 0.46502125769455865, "learning_rate": 1.1173184357541899e-05, "loss": 0.6521, "num_tokens": 31436912.0, "step": 120 }, { "epoch": 0.05827505827505827, "grad_norm": 0.4531752952796131, "learning_rate": 1.1638733705772813e-05, "loss": 0.6332, "num_tokens": 32747632.0, "step": 125 }, { "epoch": 0.06060606060606061, "grad_norm": 0.4905266086400909, "learning_rate": 1.2104283054003724e-05, "loss": 0.6285, "num_tokens": 34058352.0, "step": 130 }, { "epoch": 0.06293706293706294, "grad_norm": 0.47150776268903466, "learning_rate": 1.2569832402234637e-05, "loss": 0.6219, "num_tokens": 35355669.0, "step": 135 }, { "epoch": 0.06526806526806526, "grad_norm": 0.46418902222985564, "learning_rate": 1.303538175046555e-05, "loss": 0.6285, "num_tokens": 36666389.0, "step": 140 }, { "epoch": 0.0675990675990676, "grad_norm": 0.5360827942508652, "learning_rate": 1.3500931098696462e-05, "loss": 0.6311, "num_tokens": 37977109.0, "step": 145 }, { "epoch": 0.06993006993006994, "grad_norm": 0.4574856423555115, "learning_rate": 1.3966480446927374e-05, "loss": 0.6265, "num_tokens": 39287829.0, "step": 150 }, { "epoch": 0.07226107226107226, "grad_norm": 0.509429199061911, "learning_rate": 1.4432029795158286e-05, "loss": 0.6212, "num_tokens": 40598549.0, "step": 155 }, { "epoch": 0.07459207459207459, "grad_norm": 0.47709542958562123, "learning_rate": 1.48975791433892e-05, "loss": 0.6225, "num_tokens": 41909269.0, "step": 160 }, { "epoch": 0.07692307692307693, "grad_norm": 0.49691106283476394, "learning_rate": 1.5363128491620113e-05, "loss": 0.6349, "num_tokens": 43219989.0, "step": 165 }, { "epoch": 0.07925407925407925, "grad_norm": 0.5573190120950272, "learning_rate": 1.5828677839851026e-05, "loss": 0.637, "num_tokens": 44530709.0, "step": 170 }, { "epoch": 0.08158508158508158, "grad_norm": 0.49003264776858235, "learning_rate": 1.6294227188081936e-05, "loss": 0.6199, "num_tokens": 45841429.0, "step": 175 }, { "epoch": 0.08391608391608392, "grad_norm": 0.48723276020731904, "learning_rate": 1.675977653631285e-05, "loss": 0.6223, "num_tokens": 47152149.0, "step": 180 }, { "epoch": 0.08624708624708624, "grad_norm": 0.5590552496737182, "learning_rate": 1.7225325884543765e-05, "loss": 0.6203, "num_tokens": 48462869.0, "step": 185 }, { "epoch": 0.08857808857808858, "grad_norm": 0.5019716267640132, "learning_rate": 1.7690875232774675e-05, "loss": 0.6099, "num_tokens": 49773589.0, "step": 190 }, { "epoch": 0.09090909090909091, "grad_norm": 0.48128033207729953, "learning_rate": 1.8156424581005588e-05, "loss": 0.591, "num_tokens": 51058347.0, "step": 195 }, { "epoch": 0.09324009324009325, "grad_norm": 0.48039162413529596, "learning_rate": 1.86219739292365e-05, "loss": 0.6084, "num_tokens": 52353719.0, "step": 200 }, { "epoch": 0.09557109557109557, "grad_norm": 0.48115713986962244, "learning_rate": 1.9087523277467413e-05, "loss": 0.6106, "num_tokens": 53658551.0, "step": 205 }, { "epoch": 0.0979020979020979, "grad_norm": 0.4870180321160556, "learning_rate": 1.9553072625698323e-05, "loss": 0.6173, "num_tokens": 54952917.0, "step": 210 }, { "epoch": 0.10023310023310024, "grad_norm": 0.5094823620103848, "learning_rate": 2.001862197392924e-05, "loss": 0.6098, "num_tokens": 56263637.0, "step": 215 }, { "epoch": 0.10256410256410256, "grad_norm": 0.5013089386317249, "learning_rate": 2.0484171322160152e-05, "loss": 0.6096, "num_tokens": 57574357.0, "step": 220 }, { "epoch": 0.1048951048951049, "grad_norm": 0.5232640161264629, "learning_rate": 2.0949720670391062e-05, "loss": 0.6201, "num_tokens": 58885077.0, "step": 225 }, { "epoch": 0.10722610722610723, "grad_norm": 0.550970201278118, "learning_rate": 2.1415270018621975e-05, "loss": 0.593, "num_tokens": 60191231.0, "step": 230 }, { "epoch": 0.10955710955710955, "grad_norm": 0.6029709013121757, "learning_rate": 2.1880819366852888e-05, "loss": 0.6074, "num_tokens": 61491882.0, "step": 235 }, { "epoch": 0.11188811188811189, "grad_norm": 0.5486381526391435, "learning_rate": 2.2346368715083797e-05, "loss": 0.6098, "num_tokens": 62792124.0, "step": 240 }, { "epoch": 0.11421911421911422, "grad_norm": 0.6116198358074656, "learning_rate": 2.2811918063314713e-05, "loss": 0.5992, "num_tokens": 64102844.0, "step": 245 }, { "epoch": 0.11655011655011654, "grad_norm": 0.6390765452401985, "learning_rate": 2.3277467411545626e-05, "loss": 0.5884, "num_tokens": 65405812.0, "step": 250 }, { "epoch": 0.11888111888111888, "grad_norm": 0.5592180145233874, "learning_rate": 2.3743016759776536e-05, "loss": 0.6115, "num_tokens": 66716532.0, "step": 255 }, { "epoch": 0.12121212121212122, "grad_norm": 0.5867882274444552, "learning_rate": 2.420856610800745e-05, "loss": 0.6322, "num_tokens": 68027252.0, "step": 260 }, { "epoch": 0.12354312354312354, "grad_norm": 0.4978541728996833, "learning_rate": 2.4674115456238362e-05, "loss": 0.6125, "num_tokens": 69337972.0, "step": 265 }, { "epoch": 0.1258741258741259, "grad_norm": 0.5176243116469658, "learning_rate": 2.5139664804469275e-05, "loss": 0.5889, "num_tokens": 70648692.0, "step": 270 }, { "epoch": 0.1282051282051282, "grad_norm": 0.5205723612646859, "learning_rate": 2.5605214152700184e-05, "loss": 0.5806, "num_tokens": 71949722.0, "step": 275 }, { "epoch": 0.13053613053613053, "grad_norm": 0.5105297626501796, "learning_rate": 2.60707635009311e-05, "loss": 0.5903, "num_tokens": 73260442.0, "step": 280 }, { "epoch": 0.13286713286713286, "grad_norm": 0.5983979159514128, "learning_rate": 2.6536312849162014e-05, "loss": 0.572, "num_tokens": 74571162.0, "step": 285 }, { "epoch": 0.1351981351981352, "grad_norm": 0.5344572430585598, "learning_rate": 2.7001862197392923e-05, "loss": 0.611, "num_tokens": 75881882.0, "step": 290 }, { "epoch": 0.13752913752913754, "grad_norm": 0.4971065663534153, "learning_rate": 2.746741154562384e-05, "loss": 0.584, "num_tokens": 77192602.0, "step": 295 }, { "epoch": 0.13986013986013987, "grad_norm": 0.47253775531322106, "learning_rate": 2.793296089385475e-05, "loss": 0.6098, "num_tokens": 78503322.0, "step": 300 }, { "epoch": 0.14219114219114218, "grad_norm": 0.5871484700261882, "learning_rate": 2.8398510242085662e-05, "loss": 0.5873, "num_tokens": 79814042.0, "step": 305 }, { "epoch": 0.1445221445221445, "grad_norm": 0.5397162657687059, "learning_rate": 2.886405959031657e-05, "loss": 0.5601, "num_tokens": 81124762.0, "step": 310 }, { "epoch": 0.14685314685314685, "grad_norm": 0.5003299556137542, "learning_rate": 2.9329608938547488e-05, "loss": 0.5765, "num_tokens": 82416836.0, "step": 315 }, { "epoch": 0.14918414918414918, "grad_norm": 0.5745464582996782, "learning_rate": 2.97951582867784e-05, "loss": 0.6041, "num_tokens": 83727556.0, "step": 320 }, { "epoch": 0.15151515151515152, "grad_norm": 0.6004282699112464, "learning_rate": 3.026070763500931e-05, "loss": 0.5826, "num_tokens": 85038276.0, "step": 325 }, { "epoch": 0.15384615384615385, "grad_norm": 0.6113197906754673, "learning_rate": 3.0726256983240227e-05, "loss": 0.599, "num_tokens": 86348996.0, "step": 330 }, { "epoch": 0.1561771561771562, "grad_norm": 0.5284281986815572, "learning_rate": 3.1191806331471136e-05, "loss": 0.594, "num_tokens": 87657694.0, "step": 335 }, { "epoch": 0.1585081585081585, "grad_norm": 0.6592854288333747, "learning_rate": 3.165735567970205e-05, "loss": 0.5971, "num_tokens": 88959058.0, "step": 340 }, { "epoch": 0.16083916083916083, "grad_norm": 0.6611677002486963, "learning_rate": 3.212290502793296e-05, "loss": 0.5869, "num_tokens": 90269778.0, "step": 345 }, { "epoch": 0.16317016317016317, "grad_norm": 0.5569884136454792, "learning_rate": 3.258845437616387e-05, "loss": 0.5721, "num_tokens": 91580498.0, "step": 350 }, { "epoch": 0.1655011655011655, "grad_norm": 0.5645452564927658, "learning_rate": 3.305400372439479e-05, "loss": 0.6015, "num_tokens": 92891218.0, "step": 355 }, { "epoch": 0.16783216783216784, "grad_norm": 0.6374940933275999, "learning_rate": 3.35195530726257e-05, "loss": 0.5844, "num_tokens": 94201938.0, "step": 360 }, { "epoch": 0.17016317016317017, "grad_norm": 0.6011646060068315, "learning_rate": 3.3985102420856614e-05, "loss": 0.5875, "num_tokens": 95507207.0, "step": 365 }, { "epoch": 0.17249417249417248, "grad_norm": 0.5884433838979773, "learning_rate": 3.445065176908753e-05, "loss": 0.5751, "num_tokens": 96817927.0, "step": 370 }, { "epoch": 0.17482517482517482, "grad_norm": 0.6051679815251396, "learning_rate": 3.491620111731844e-05, "loss": 0.5858, "num_tokens": 98128647.0, "step": 375 }, { "epoch": 0.17715617715617715, "grad_norm": 0.6049709145754255, "learning_rate": 3.538175046554935e-05, "loss": 0.5924, "num_tokens": 99432200.0, "step": 380 }, { "epoch": 0.1794871794871795, "grad_norm": 0.4705325840654263, "learning_rate": 3.584729981378026e-05, "loss": 0.5848, "num_tokens": 100738580.0, "step": 385 }, { "epoch": 0.18181818181818182, "grad_norm": 0.6888947351263296, "learning_rate": 3.6312849162011175e-05, "loss": 0.5765, "num_tokens": 102049300.0, "step": 390 }, { "epoch": 0.18414918414918416, "grad_norm": 0.5752502148380462, "learning_rate": 3.6778398510242085e-05, "loss": 0.5789, "num_tokens": 103360020.0, "step": 395 }, { "epoch": 0.1864801864801865, "grad_norm": 0.6432121990501041, "learning_rate": 3.7243947858473e-05, "loss": 0.5997, "num_tokens": 104670740.0, "step": 400 }, { "epoch": 0.1888111888111888, "grad_norm": 0.5905703187842473, "learning_rate": 3.770949720670392e-05, "loss": 0.5887, "num_tokens": 105981460.0, "step": 405 }, { "epoch": 0.19114219114219114, "grad_norm": 0.6634303714634665, "learning_rate": 3.817504655493483e-05, "loss": 0.5745, "num_tokens": 107287613.0, "step": 410 }, { "epoch": 0.19347319347319347, "grad_norm": 0.6069673456722715, "learning_rate": 3.8640595903165736e-05, "loss": 0.6034, "num_tokens": 108598333.0, "step": 415 }, { "epoch": 0.1958041958041958, "grad_norm": 0.6162680847616665, "learning_rate": 3.9106145251396646e-05, "loss": 0.5891, "num_tokens": 109909053.0, "step": 420 }, { "epoch": 0.19813519813519814, "grad_norm": 0.5857092916452535, "learning_rate": 3.957169459962756e-05, "loss": 0.5789, "num_tokens": 111219773.0, "step": 425 }, { "epoch": 0.20046620046620048, "grad_norm": 0.6647569324831807, "learning_rate": 4.003724394785848e-05, "loss": 0.606, "num_tokens": 112530493.0, "step": 430 }, { "epoch": 0.20279720279720279, "grad_norm": 0.5524426568425298, "learning_rate": 4.050279329608939e-05, "loss": 0.5825, "num_tokens": 113841213.0, "step": 435 }, { "epoch": 0.20512820512820512, "grad_norm": 0.5810994370146649, "learning_rate": 4.0968342644320304e-05, "loss": 0.6003, "num_tokens": 115151933.0, "step": 440 }, { "epoch": 0.20745920745920746, "grad_norm": 0.5641669807873702, "learning_rate": 4.143389199255121e-05, "loss": 0.5916, "num_tokens": 116454721.0, "step": 445 }, { "epoch": 0.2097902097902098, "grad_norm": 0.5150156412806575, "learning_rate": 4.1899441340782123e-05, "loss": 0.5836, "num_tokens": 117765441.0, "step": 450 }, { "epoch": 0.21212121212121213, "grad_norm": 0.47565520542426254, "learning_rate": 4.236499068901304e-05, "loss": 0.5875, "num_tokens": 119076161.0, "step": 455 }, { "epoch": 0.21445221445221446, "grad_norm": 0.5296345463311969, "learning_rate": 4.283054003724395e-05, "loss": 0.5875, "num_tokens": 120376968.0, "step": 460 }, { "epoch": 0.21678321678321677, "grad_norm": 0.5322255322866658, "learning_rate": 4.3296089385474866e-05, "loss": 0.5675, "num_tokens": 121685949.0, "step": 465 }, { "epoch": 0.2191142191142191, "grad_norm": 0.5124276326443321, "learning_rate": 4.3761638733705775e-05, "loss": 0.5834, "num_tokens": 122996669.0, "step": 470 }, { "epoch": 0.22144522144522144, "grad_norm": 0.5493568540280628, "learning_rate": 4.4227188081936685e-05, "loss": 0.5699, "num_tokens": 124296690.0, "step": 475 }, { "epoch": 0.22377622377622378, "grad_norm": 0.5080116188657589, "learning_rate": 4.4692737430167594e-05, "loss": 0.5859, "num_tokens": 125607410.0, "step": 480 }, { "epoch": 0.2261072261072261, "grad_norm": 0.4796592051544516, "learning_rate": 4.515828677839851e-05, "loss": 0.5707, "num_tokens": 126904766.0, "step": 485 }, { "epoch": 0.22843822843822845, "grad_norm": 0.6183745714886669, "learning_rate": 4.562383612662943e-05, "loss": 0.5921, "num_tokens": 128215486.0, "step": 490 }, { "epoch": 0.23076923076923078, "grad_norm": 0.5361760011420288, "learning_rate": 4.6089385474860336e-05, "loss": 0.5669, "num_tokens": 129521193.0, "step": 495 }, { "epoch": 0.2331002331002331, "grad_norm": 0.635330676879503, "learning_rate": 4.655493482309125e-05, "loss": 0.5884, "num_tokens": 130831913.0, "step": 500 }, { "epoch": 0.23543123543123542, "grad_norm": 0.7485046943209333, "learning_rate": 4.702048417132216e-05, "loss": 0.5685, "num_tokens": 132142633.0, "step": 505 }, { "epoch": 0.23776223776223776, "grad_norm": 0.48973907523525306, "learning_rate": 4.748603351955307e-05, "loss": 0.5748, "num_tokens": 133447076.0, "step": 510 }, { "epoch": 0.2400932400932401, "grad_norm": 0.501560835222231, "learning_rate": 4.795158286778399e-05, "loss": 0.591, "num_tokens": 134742243.0, "step": 515 }, { "epoch": 0.24242424242424243, "grad_norm": 0.5768660260790422, "learning_rate": 4.84171322160149e-05, "loss": 0.5918, "num_tokens": 136036466.0, "step": 520 }, { "epoch": 0.24475524475524477, "grad_norm": 0.5577461974387155, "learning_rate": 4.8882681564245814e-05, "loss": 0.5881, "num_tokens": 137347186.0, "step": 525 }, { "epoch": 0.24708624708624707, "grad_norm": 0.511973917006169, "learning_rate": 4.9348230912476724e-05, "loss": 0.5767, "num_tokens": 138657906.0, "step": 530 }, { "epoch": 0.2494172494172494, "grad_norm": 0.5058707953577455, "learning_rate": 4.981378026070764e-05, "loss": 0.5768, "num_tokens": 139963192.0, "step": 535 }, { "epoch": 0.2517482517482518, "grad_norm": 0.5486438929823766, "learning_rate": 4.999999037242581e-05, "loss": 0.5832, "num_tokens": 141273912.0, "step": 540 }, { "epoch": 0.2540792540792541, "grad_norm": 0.5099572474285374, "learning_rate": 4.999993153728008e-05, "loss": 0.5689, "num_tokens": 142570091.0, "step": 545 }, { "epoch": 0.2564102564102564, "grad_norm": 0.5750075114680016, "learning_rate": 4.9999819215780634e-05, "loss": 0.5811, "num_tokens": 143880811.0, "step": 550 }, { "epoch": 0.25874125874125875, "grad_norm": 0.6134226971370147, "learning_rate": 4.9999653408194474e-05, "loss": 0.5843, "num_tokens": 145191531.0, "step": 555 }, { "epoch": 0.26107226107226106, "grad_norm": 0.6314862123358536, "learning_rate": 4.999943411491576e-05, "loss": 0.5793, "num_tokens": 146502251.0, "step": 560 }, { "epoch": 0.2634032634032634, "grad_norm": 0.6889606086375162, "learning_rate": 4.9999161336465794e-05, "loss": 0.5702, "num_tokens": 147812971.0, "step": 565 }, { "epoch": 0.26573426573426573, "grad_norm": 0.6217160853993534, "learning_rate": 4.999883507349302e-05, "loss": 0.5774, "num_tokens": 149113471.0, "step": 570 }, { "epoch": 0.2680652680652681, "grad_norm": 0.8603569158115519, "learning_rate": 4.9998455326773e-05, "loss": 0.5723, "num_tokens": 150424191.0, "step": 575 }, { "epoch": 0.2703962703962704, "grad_norm": 0.5345785403032188, "learning_rate": 4.9998022097208494e-05, "loss": 0.5841, "num_tokens": 151734911.0, "step": 580 }, { "epoch": 0.2727272727272727, "grad_norm": 0.4897793835702214, "learning_rate": 4.9997535385829355e-05, "loss": 0.5847, "num_tokens": 153045631.0, "step": 585 }, { "epoch": 0.27505827505827507, "grad_norm": 0.6403908386621698, "learning_rate": 4.9996995193792575e-05, "loss": 0.5852, "num_tokens": 154356351.0, "step": 590 }, { "epoch": 0.2773892773892774, "grad_norm": 0.5209712741969316, "learning_rate": 4.9996401522382285e-05, "loss": 0.5581, "num_tokens": 155667071.0, "step": 595 }, { "epoch": 0.27972027972027974, "grad_norm": 0.4869182742809078, "learning_rate": 4.9995754373009756e-05, "loss": 0.5818, "num_tokens": 156977308.0, "step": 600 }, { "epoch": 0.28205128205128205, "grad_norm": 0.5217363999507327, "learning_rate": 4.999505374721338e-05, "loss": 0.568, "num_tokens": 158288028.0, "step": 605 }, { "epoch": 0.28438228438228436, "grad_norm": 0.5636140387216821, "learning_rate": 4.999429964665866e-05, "loss": 0.5685, "num_tokens": 159598748.0, "step": 610 }, { "epoch": 0.2867132867132867, "grad_norm": 0.4953249441318155, "learning_rate": 4.999349207313823e-05, "loss": 0.5569, "num_tokens": 160909468.0, "step": 615 }, { "epoch": 0.289044289044289, "grad_norm": 0.5870745218369171, "learning_rate": 4.999263102857185e-05, "loss": 0.5684, "num_tokens": 162220188.0, "step": 620 }, { "epoch": 0.2913752913752914, "grad_norm": 0.5796420642176587, "learning_rate": 4.9991716515006354e-05, "loss": 0.5908, "num_tokens": 163517758.0, "step": 625 }, { "epoch": 0.2937062937062937, "grad_norm": 0.5238869041431976, "learning_rate": 4.9990748534615714e-05, "loss": 0.5591, "num_tokens": 164828478.0, "step": 630 }, { "epoch": 0.29603729603729606, "grad_norm": 0.5863277078576777, "learning_rate": 4.998972708970101e-05, "loss": 0.5777, "num_tokens": 166123691.0, "step": 635 }, { "epoch": 0.29836829836829837, "grad_norm": 0.5171070705654046, "learning_rate": 4.998865218269036e-05, "loss": 0.5659, "num_tokens": 167423794.0, "step": 640 }, { "epoch": 0.3006993006993007, "grad_norm": 0.6049960425262351, "learning_rate": 4.998752381613905e-05, "loss": 0.5683, "num_tokens": 168734514.0, "step": 645 }, { "epoch": 0.30303030303030304, "grad_norm": 0.4913193380088962, "learning_rate": 4.998634199272939e-05, "loss": 0.5561, "num_tokens": 170045234.0, "step": 650 }, { "epoch": 0.30536130536130535, "grad_norm": 0.47269645394182036, "learning_rate": 4.9985106715270786e-05, "loss": 0.5509, "num_tokens": 171355954.0, "step": 655 }, { "epoch": 0.3076923076923077, "grad_norm": 0.5606565456686575, "learning_rate": 4.99838179866997e-05, "loss": 0.5639, "num_tokens": 172657586.0, "step": 660 }, { "epoch": 0.31002331002331, "grad_norm": 0.5304938940189576, "learning_rate": 4.99824758100797e-05, "loss": 0.5512, "num_tokens": 173968306.0, "step": 665 }, { "epoch": 0.3123543123543124, "grad_norm": 0.4909731279417892, "learning_rate": 4.998108018860136e-05, "loss": 0.5729, "num_tokens": 175279026.0, "step": 670 }, { "epoch": 0.3146853146853147, "grad_norm": 0.5316113973406738, "learning_rate": 4.997963112558232e-05, "loss": 0.5679, "num_tokens": 176589746.0, "step": 675 }, { "epoch": 0.317016317016317, "grad_norm": 0.5548933976438383, "learning_rate": 4.9978128624467266e-05, "loss": 0.5559, "num_tokens": 177900466.0, "step": 680 }, { "epoch": 0.31934731934731936, "grad_norm": 0.6354292278890509, "learning_rate": 4.997657268882791e-05, "loss": 0.569, "num_tokens": 179211186.0, "step": 685 }, { "epoch": 0.32167832167832167, "grad_norm": 0.5118276377254981, "learning_rate": 4.9974963322362986e-05, "loss": 0.575, "num_tokens": 180509493.0, "step": 690 }, { "epoch": 0.32400932400932403, "grad_norm": 0.5775683157667488, "learning_rate": 4.997330052889826e-05, "loss": 0.5627, "num_tokens": 181820213.0, "step": 695 }, { "epoch": 0.32634032634032634, "grad_norm": 0.5342512966303329, "learning_rate": 4.9971584312386467e-05, "loss": 0.5616, "num_tokens": 183130933.0, "step": 700 }, { "epoch": 0.32867132867132864, "grad_norm": 0.4679024175601337, "learning_rate": 4.996981467690738e-05, "loss": 0.5549, "num_tokens": 184441653.0, "step": 705 }, { "epoch": 0.331002331002331, "grad_norm": 0.46007737133219345, "learning_rate": 4.9967991626667726e-05, "loss": 0.5709, "num_tokens": 185752373.0, "step": 710 }, { "epoch": 0.3333333333333333, "grad_norm": 0.5781752361746175, "learning_rate": 4.996611516600122e-05, "loss": 0.5705, "num_tokens": 187063093.0, "step": 715 }, { "epoch": 0.3356643356643357, "grad_norm": 0.505350299094054, "learning_rate": 4.996418529936855e-05, "loss": 0.5488, "num_tokens": 188373813.0, "step": 720 }, { "epoch": 0.337995337995338, "grad_norm": 0.5228405284991805, "learning_rate": 4.9962202031357356e-05, "loss": 0.5719, "num_tokens": 189684342.0, "step": 725 }, { "epoch": 0.34032634032634035, "grad_norm": 0.5260336277261016, "learning_rate": 4.996016536668221e-05, "loss": 0.5723, "num_tokens": 190995062.0, "step": 730 }, { "epoch": 0.34265734265734266, "grad_norm": 0.5197547644486562, "learning_rate": 4.9958075310184634e-05, "loss": 0.5769, "num_tokens": 192305782.0, "step": 735 }, { "epoch": 0.34498834498834496, "grad_norm": 0.4760010257555004, "learning_rate": 4.995593186683308e-05, "loss": 0.5504, "num_tokens": 193616502.0, "step": 740 }, { "epoch": 0.3473193473193473, "grad_norm": 0.5832438782265436, "learning_rate": 4.995373504172286e-05, "loss": 0.5709, "num_tokens": 194927222.0, "step": 745 }, { "epoch": 0.34965034965034963, "grad_norm": 0.4426080176608608, "learning_rate": 4.9951484840076246e-05, "loss": 0.56, "num_tokens": 196237942.0, "step": 750 }, { "epoch": 0.351981351981352, "grad_norm": 0.5864066431584307, "learning_rate": 4.9949181267242365e-05, "loss": 0.5494, "num_tokens": 197548662.0, "step": 755 }, { "epoch": 0.3543123543123543, "grad_norm": 0.6479586979194639, "learning_rate": 4.994682432869722e-05, "loss": 0.548, "num_tokens": 198859382.0, "step": 760 }, { "epoch": 0.35664335664335667, "grad_norm": 0.5890555644210004, "learning_rate": 4.994441403004366e-05, "loss": 0.5513, "num_tokens": 200170102.0, "step": 765 }, { "epoch": 0.358974358974359, "grad_norm": 0.5160808856165031, "learning_rate": 4.9941950377011424e-05, "loss": 0.5554, "num_tokens": 201480822.0, "step": 770 }, { "epoch": 0.3613053613053613, "grad_norm": 0.46297814176374613, "learning_rate": 4.993943337545703e-05, "loss": 0.5607, "num_tokens": 202791542.0, "step": 775 }, { "epoch": 0.36363636363636365, "grad_norm": 0.5611193032764832, "learning_rate": 4.993686303136385e-05, "loss": 0.5539, "num_tokens": 204102262.0, "step": 780 }, { "epoch": 0.36596736596736595, "grad_norm": 0.5066631995519579, "learning_rate": 4.9934239350842064e-05, "loss": 0.5613, "num_tokens": 205412982.0, "step": 785 }, { "epoch": 0.3682983682983683, "grad_norm": 0.5659611475992192, "learning_rate": 4.99315623401286e-05, "loss": 0.5613, "num_tokens": 206723702.0, "step": 790 }, { "epoch": 0.3706293706293706, "grad_norm": 0.48541130012045497, "learning_rate": 4.992883200558721e-05, "loss": 0.5534, "num_tokens": 208034422.0, "step": 795 }, { "epoch": 0.372960372960373, "grad_norm": 0.5007321223818056, "learning_rate": 4.992604835370838e-05, "loss": 0.5676, "num_tokens": 209345142.0, "step": 800 }, { "epoch": 0.3752913752913753, "grad_norm": 0.49768356226116434, "learning_rate": 4.992321139110935e-05, "loss": 0.5628, "num_tokens": 210655862.0, "step": 805 }, { "epoch": 0.3776223776223776, "grad_norm": 0.49652952690828717, "learning_rate": 4.992032112453409e-05, "loss": 0.5602, "num_tokens": 211966582.0, "step": 810 }, { "epoch": 0.37995337995337997, "grad_norm": 0.4649469721716684, "learning_rate": 4.9917377560853265e-05, "loss": 0.5545, "num_tokens": 213277302.0, "step": 815 }, { "epoch": 0.3822843822843823, "grad_norm": 0.5360683146657782, "learning_rate": 4.991438070706428e-05, "loss": 0.5519, "num_tokens": 214586037.0, "step": 820 }, { "epoch": 0.38461538461538464, "grad_norm": 0.49705599435755293, "learning_rate": 4.991133057029116e-05, "loss": 0.5509, "num_tokens": 215896757.0, "step": 825 }, { "epoch": 0.38694638694638694, "grad_norm": 0.5283918826785868, "learning_rate": 4.9908227157784645e-05, "loss": 0.5391, "num_tokens": 217207477.0, "step": 830 }, { "epoch": 0.38927738927738925, "grad_norm": 0.5007115483504386, "learning_rate": 4.9905070476922086e-05, "loss": 0.5639, "num_tokens": 218509916.0, "step": 835 }, { "epoch": 0.3916083916083916, "grad_norm": 0.5498454232202491, "learning_rate": 4.9901860535207486e-05, "loss": 0.5705, "num_tokens": 219820636.0, "step": 840 }, { "epoch": 0.3939393939393939, "grad_norm": 0.48240740398938314, "learning_rate": 4.9898597340271446e-05, "loss": 0.5368, "num_tokens": 221131356.0, "step": 845 }, { "epoch": 0.3962703962703963, "grad_norm": 0.5056929824942472, "learning_rate": 4.989528089987117e-05, "loss": 0.5575, "num_tokens": 222442076.0, "step": 850 }, { "epoch": 0.3986013986013986, "grad_norm": 0.489254628262671, "learning_rate": 4.989191122189042e-05, "loss": 0.5493, "num_tokens": 223752796.0, "step": 855 }, { "epoch": 0.40093240093240096, "grad_norm": 0.48008396663558006, "learning_rate": 4.988848831433952e-05, "loss": 0.5428, "num_tokens": 225063516.0, "step": 860 }, { "epoch": 0.40326340326340326, "grad_norm": 0.49333444429559375, "learning_rate": 4.9885012185355346e-05, "loss": 0.5481, "num_tokens": 226374236.0, "step": 865 }, { "epoch": 0.40559440559440557, "grad_norm": 0.45124369244739004, "learning_rate": 4.9881482843201266e-05, "loss": 0.555, "num_tokens": 227684956.0, "step": 870 }, { "epoch": 0.40792540792540793, "grad_norm": 0.5496711964879529, "learning_rate": 4.987790029626716e-05, "loss": 0.5616, "num_tokens": 228995676.0, "step": 875 }, { "epoch": 0.41025641025641024, "grad_norm": 0.47265434607763146, "learning_rate": 4.9874264553069376e-05, "loss": 0.5386, "num_tokens": 230306396.0, "step": 880 }, { "epoch": 0.4125874125874126, "grad_norm": 0.5135497697717332, "learning_rate": 4.987057562225074e-05, "loss": 0.5603, "num_tokens": 231617116.0, "step": 885 }, { "epoch": 0.4149184149184149, "grad_norm": 0.4682122711297366, "learning_rate": 4.986683351258048e-05, "loss": 0.5445, "num_tokens": 232927836.0, "step": 890 }, { "epoch": 0.4172494172494173, "grad_norm": 0.4112633329315492, "learning_rate": 4.986303823295427e-05, "loss": 0.5426, "num_tokens": 234238556.0, "step": 895 }, { "epoch": 0.4195804195804196, "grad_norm": 0.402214665476133, "learning_rate": 4.985918979239416e-05, "loss": 0.5485, "num_tokens": 235549276.0, "step": 900 }, { "epoch": 0.4219114219114219, "grad_norm": 0.5455511517804665, "learning_rate": 4.985528820004859e-05, "loss": 0.557, "num_tokens": 236859996.0, "step": 905 }, { "epoch": 0.42424242424242425, "grad_norm": 0.47199199317580776, "learning_rate": 4.9851333465192336e-05, "loss": 0.5371, "num_tokens": 238170716.0, "step": 910 }, { "epoch": 0.42657342657342656, "grad_norm": 0.4776585972671657, "learning_rate": 4.984732559722651e-05, "loss": 0.555, "num_tokens": 239481436.0, "step": 915 }, { "epoch": 0.4289044289044289, "grad_norm": 0.5249113633053311, "learning_rate": 4.984326460567852e-05, "loss": 0.5629, "num_tokens": 240792156.0, "step": 920 }, { "epoch": 0.43123543123543123, "grad_norm": 0.5202213780622079, "learning_rate": 4.9839150500202085e-05, "loss": 0.5443, "num_tokens": 242102876.0, "step": 925 }, { "epoch": 0.43356643356643354, "grad_norm": 0.5456374996972472, "learning_rate": 4.983498329057715e-05, "loss": 0.5597, "num_tokens": 243413596.0, "step": 930 }, { "epoch": 0.4358974358974359, "grad_norm": 0.4380920966683162, "learning_rate": 4.983076298670994e-05, "loss": 0.5325, "num_tokens": 244719166.0, "step": 935 }, { "epoch": 0.4382284382284382, "grad_norm": 0.5542937783335792, "learning_rate": 4.982648959863285e-05, "loss": 0.5562, "num_tokens": 246029886.0, "step": 940 }, { "epoch": 0.4405594405594406, "grad_norm": 0.503124227495007, "learning_rate": 4.982216313650448e-05, "loss": 0.554, "num_tokens": 247327205.0, "step": 945 }, { "epoch": 0.4428904428904429, "grad_norm": 0.5660923842068369, "learning_rate": 4.981778361060962e-05, "loss": 0.5592, "num_tokens": 248637925.0, "step": 950 }, { "epoch": 0.44522144522144524, "grad_norm": 0.46034726240926843, "learning_rate": 4.981335103135919e-05, "loss": 0.5484, "num_tokens": 249948645.0, "step": 955 }, { "epoch": 0.44755244755244755, "grad_norm": 0.46499481242052637, "learning_rate": 4.980886540929021e-05, "loss": 0.5432, "num_tokens": 251259365.0, "step": 960 }, { "epoch": 0.44988344988344986, "grad_norm": 0.5139881063742346, "learning_rate": 4.98043267550658e-05, "loss": 0.5609, "num_tokens": 252570085.0, "step": 965 }, { "epoch": 0.4522144522144522, "grad_norm": 0.44368184026909635, "learning_rate": 4.979973507947516e-05, "loss": 0.5372, "num_tokens": 253880805.0, "step": 970 }, { "epoch": 0.45454545454545453, "grad_norm": 0.45865519207556255, "learning_rate": 4.979509039343352e-05, "loss": 0.559, "num_tokens": 255191525.0, "step": 975 }, { "epoch": 0.4568764568764569, "grad_norm": 0.5353248215200734, "learning_rate": 4.9790392707982137e-05, "loss": 0.5715, "num_tokens": 256502245.0, "step": 980 }, { "epoch": 0.4592074592074592, "grad_norm": 0.4173868436003061, "learning_rate": 4.978564203428823e-05, "loss": 0.5447, "num_tokens": 257812965.0, "step": 985 }, { "epoch": 0.46153846153846156, "grad_norm": 0.5116076701150912, "learning_rate": 4.9780838383645007e-05, "loss": 0.5551, "num_tokens": 259123685.0, "step": 990 }, { "epoch": 0.46386946386946387, "grad_norm": 0.46300754955199347, "learning_rate": 4.977598176747161e-05, "loss": 0.539, "num_tokens": 260425724.0, "step": 995 }, { "epoch": 0.4662004662004662, "grad_norm": 0.5076641753208481, "learning_rate": 4.977107219731307e-05, "loss": 0.5526, "num_tokens": 261736444.0, "step": 1000 }, { "epoch": 0.46853146853146854, "grad_norm": 0.47654605410882744, "learning_rate": 4.9766109684840316e-05, "loss": 0.5507, "num_tokens": 263047164.0, "step": 1005 }, { "epoch": 0.47086247086247085, "grad_norm": 0.47216898809671914, "learning_rate": 4.9761094241850137e-05, "loss": 0.5564, "num_tokens": 264341546.0, "step": 1010 }, { "epoch": 0.4731934731934732, "grad_norm": 0.522140635226013, "learning_rate": 4.9756025880265124e-05, "loss": 0.5583, "num_tokens": 265652266.0, "step": 1015 }, { "epoch": 0.4755244755244755, "grad_norm": 0.48172818261748473, "learning_rate": 4.975090461213368e-05, "loss": 0.5534, "num_tokens": 266962986.0, "step": 1020 }, { "epoch": 0.47785547785547783, "grad_norm": 0.43774729877649815, "learning_rate": 4.9745730449629967e-05, "loss": 0.5398, "num_tokens": 268273706.0, "step": 1025 }, { "epoch": 0.4801864801864802, "grad_norm": 0.45666010168372156, "learning_rate": 4.9740503405053904e-05, "loss": 0.558, "num_tokens": 269584426.0, "step": 1030 }, { "epoch": 0.4825174825174825, "grad_norm": 0.45349675110750093, "learning_rate": 4.9735223490831104e-05, "loss": 0.5558, "num_tokens": 270895146.0, "step": 1035 }, { "epoch": 0.48484848484848486, "grad_norm": 0.4811125107950563, "learning_rate": 4.9729890719512875e-05, "loss": 0.5332, "num_tokens": 272205866.0, "step": 1040 }, { "epoch": 0.48717948717948717, "grad_norm": 0.48607706780099336, "learning_rate": 4.972450510377615e-05, "loss": 0.5511, "num_tokens": 273514547.0, "step": 1045 }, { "epoch": 0.48951048951048953, "grad_norm": 0.4988992952928454, "learning_rate": 4.971906665642351e-05, "loss": 0.5509, "num_tokens": 274825267.0, "step": 1050 }, { "epoch": 0.49184149184149184, "grad_norm": 0.45178980549967424, "learning_rate": 4.971357539038311e-05, "loss": 0.5352, "num_tokens": 276135501.0, "step": 1055 }, { "epoch": 0.49417249417249415, "grad_norm": 0.4628244844080021, "learning_rate": 4.970803131870867e-05, "loss": 0.5576, "num_tokens": 277446221.0, "step": 1060 }, { "epoch": 0.4965034965034965, "grad_norm": 0.4968199670572577, "learning_rate": 4.9702434454579435e-05, "loss": 0.5302, "num_tokens": 278751167.0, "step": 1065 }, { "epoch": 0.4988344988344988, "grad_norm": 0.5350366333592432, "learning_rate": 4.969678481130017e-05, "loss": 0.5447, "num_tokens": 280061887.0, "step": 1070 }, { "epoch": 0.5011655011655012, "grad_norm": 0.4428901687128577, "learning_rate": 4.9691082402301056e-05, "loss": 0.5515, "num_tokens": 281372607.0, "step": 1075 }, { "epoch": 0.5034965034965035, "grad_norm": 0.4375390965832773, "learning_rate": 4.9685327241137755e-05, "loss": 0.5429, "num_tokens": 282683327.0, "step": 1080 }, { "epoch": 0.5058275058275058, "grad_norm": 0.4522043765274002, "learning_rate": 4.967951934149132e-05, "loss": 0.5603, "num_tokens": 283980719.0, "step": 1085 }, { "epoch": 0.5081585081585082, "grad_norm": 0.45069894232095004, "learning_rate": 4.967365871716814e-05, "loss": 0.5528, "num_tokens": 285291439.0, "step": 1090 }, { "epoch": 0.5104895104895105, "grad_norm": 0.4750493200396816, "learning_rate": 4.9667745382099986e-05, "loss": 0.533, "num_tokens": 286602159.0, "step": 1095 }, { "epoch": 0.5128205128205128, "grad_norm": 0.4549131138548312, "learning_rate": 4.96617793503439e-05, "loss": 0.5531, "num_tokens": 287912879.0, "step": 1100 }, { "epoch": 0.5151515151515151, "grad_norm": 0.5019297246106439, "learning_rate": 4.9655760636082214e-05, "loss": 0.5749, "num_tokens": 289215063.0, "step": 1105 }, { "epoch": 0.5174825174825175, "grad_norm": 0.4113411292647171, "learning_rate": 4.964968925362248e-05, "loss": 0.5372, "num_tokens": 290525783.0, "step": 1110 }, { "epoch": 0.5198135198135199, "grad_norm": 0.41397626069442495, "learning_rate": 4.964356521739746e-05, "loss": 0.5385, "num_tokens": 291823567.0, "step": 1115 }, { "epoch": 0.5221445221445221, "grad_norm": 0.44284424686828006, "learning_rate": 4.9637388541965074e-05, "loss": 0.5346, "num_tokens": 293122806.0, "step": 1120 }, { "epoch": 0.5244755244755245, "grad_norm": 0.47707957535562784, "learning_rate": 4.9631159242008394e-05, "loss": 0.5411, "num_tokens": 294423907.0, "step": 1125 }, { "epoch": 0.5268065268065268, "grad_norm": 0.5044770815753655, "learning_rate": 4.9624877332335576e-05, "loss": 0.5675, "num_tokens": 295734627.0, "step": 1130 }, { "epoch": 0.5291375291375291, "grad_norm": 0.5774114892004122, "learning_rate": 4.9618542827879826e-05, "loss": 0.5546, "num_tokens": 297045347.0, "step": 1135 }, { "epoch": 0.5314685314685315, "grad_norm": 0.4376278267424837, "learning_rate": 4.9612155743699416e-05, "loss": 0.5377, "num_tokens": 298356067.0, "step": 1140 }, { "epoch": 0.5337995337995338, "grad_norm": 0.5642480010358203, "learning_rate": 4.960571609497756e-05, "loss": 0.5576, "num_tokens": 299666787.0, "step": 1145 }, { "epoch": 0.5361305361305362, "grad_norm": 0.46779318091035216, "learning_rate": 4.9599223897022474e-05, "loss": 0.5292, "num_tokens": 300977507.0, "step": 1150 }, { "epoch": 0.5384615384615384, "grad_norm": 0.5065887547269632, "learning_rate": 4.959267916526726e-05, "loss": 0.5493, "num_tokens": 302288227.0, "step": 1155 }, { "epoch": 0.5407925407925408, "grad_norm": 0.4516510903602445, "learning_rate": 4.958608191526992e-05, "loss": 0.5392, "num_tokens": 303598947.0, "step": 1160 }, { "epoch": 0.5431235431235432, "grad_norm": 0.4860574849415324, "learning_rate": 4.957943216271328e-05, "loss": 0.5479, "num_tokens": 304909667.0, "step": 1165 }, { "epoch": 0.5454545454545454, "grad_norm": 0.4464946298053418, "learning_rate": 4.9572729923405e-05, "loss": 0.5459, "num_tokens": 306213321.0, "step": 1170 }, { "epoch": 0.5477855477855478, "grad_norm": 0.4873945912917641, "learning_rate": 4.956597521327751e-05, "loss": 0.5616, "num_tokens": 307524041.0, "step": 1175 }, { "epoch": 0.5501165501165501, "grad_norm": 0.4388018067262825, "learning_rate": 4.955916804838794e-05, "loss": 0.5423, "num_tokens": 308834761.0, "step": 1180 }, { "epoch": 0.5524475524475524, "grad_norm": 0.5105304916507707, "learning_rate": 4.955230844491815e-05, "loss": 0.5437, "num_tokens": 310145481.0, "step": 1185 }, { "epoch": 0.5547785547785548, "grad_norm": 0.4456654310545227, "learning_rate": 4.954539641917464e-05, "loss": 0.522, "num_tokens": 311456201.0, "step": 1190 }, { "epoch": 0.5571095571095571, "grad_norm": 0.4454183246164721, "learning_rate": 4.953843198758853e-05, "loss": 0.5404, "num_tokens": 312766921.0, "step": 1195 }, { "epoch": 0.5594405594405595, "grad_norm": 0.4348791044569224, "learning_rate": 4.953141516671551e-05, "loss": 0.543, "num_tokens": 314077641.0, "step": 1200 }, { "epoch": 0.5617715617715617, "grad_norm": 0.4380820952282363, "learning_rate": 4.952434597323582e-05, "loss": 0.5396, "num_tokens": 315388361.0, "step": 1205 }, { "epoch": 0.5641025641025641, "grad_norm": 0.5026501223667641, "learning_rate": 4.9517224423954207e-05, "loss": 0.5347, "num_tokens": 316699081.0, "step": 1210 }, { "epoch": 0.5664335664335665, "grad_norm": 0.44043504602766326, "learning_rate": 4.951005053579985e-05, "loss": 0.5339, "num_tokens": 317998833.0, "step": 1215 }, { "epoch": 0.5687645687645687, "grad_norm": 0.4743068657928212, "learning_rate": 4.950282432582635e-05, "loss": 0.5339, "num_tokens": 319309553.0, "step": 1220 }, { "epoch": 0.5710955710955711, "grad_norm": 0.47052451377323873, "learning_rate": 4.9495545811211724e-05, "loss": 0.5294, "num_tokens": 320613226.0, "step": 1225 }, { "epoch": 0.5734265734265734, "grad_norm": 0.4514576116741238, "learning_rate": 4.948821500925829e-05, "loss": 0.5322, "num_tokens": 321923946.0, "step": 1230 }, { "epoch": 0.5757575757575758, "grad_norm": 0.499267629747507, "learning_rate": 4.948083193739267e-05, "loss": 0.5288, "num_tokens": 323234666.0, "step": 1235 }, { "epoch": 0.578088578088578, "grad_norm": 0.41784291038069327, "learning_rate": 4.947339661316574e-05, "loss": 0.5412, "num_tokens": 324545386.0, "step": 1240 }, { "epoch": 0.5804195804195804, "grad_norm": 0.4039329790155304, "learning_rate": 4.946590905425262e-05, "loss": 0.5417, "num_tokens": 325856106.0, "step": 1245 }, { "epoch": 0.5827505827505828, "grad_norm": 0.47163444684620975, "learning_rate": 4.9458369278452536e-05, "loss": 0.5312, "num_tokens": 327166826.0, "step": 1250 }, { "epoch": 0.585081585081585, "grad_norm": 0.49628607779772915, "learning_rate": 4.94507773036889e-05, "loss": 0.5646, "num_tokens": 328477546.0, "step": 1255 }, { "epoch": 0.5874125874125874, "grad_norm": 0.47690969125101035, "learning_rate": 4.9443133148009193e-05, "loss": 0.5458, "num_tokens": 329788266.0, "step": 1260 }, { "epoch": 0.5897435897435898, "grad_norm": 0.5529225791199783, "learning_rate": 4.943543682958494e-05, "loss": 0.5515, "num_tokens": 331098986.0, "step": 1265 }, { "epoch": 0.5920745920745921, "grad_norm": 0.4570156847193979, "learning_rate": 4.942768836671165e-05, "loss": 0.5624, "num_tokens": 332409706.0, "step": 1270 }, { "epoch": 0.5944055944055944, "grad_norm": 0.4476694941858805, "learning_rate": 4.941988777780881e-05, "loss": 0.5278, "num_tokens": 333720426.0, "step": 1275 }, { "epoch": 0.5967365967365967, "grad_norm": 0.5092541231473139, "learning_rate": 4.941203508141982e-05, "loss": 0.541, "num_tokens": 335031146.0, "step": 1280 }, { "epoch": 0.5990675990675991, "grad_norm": 0.49506494033393816, "learning_rate": 4.940413029621193e-05, "loss": 0.5176, "num_tokens": 336338731.0, "step": 1285 }, { "epoch": 0.6013986013986014, "grad_norm": 0.4380679777553619, "learning_rate": 4.939617344097622e-05, "loss": 0.5303, "num_tokens": 337649451.0, "step": 1290 }, { "epoch": 0.6037296037296037, "grad_norm": 0.48586508532838385, "learning_rate": 4.938816453462758e-05, "loss": 0.536, "num_tokens": 338960171.0, "step": 1295 }, { "epoch": 0.6060606060606061, "grad_norm": 0.530127091401037, "learning_rate": 4.9380103596204584e-05, "loss": 0.5227, "num_tokens": 340270891.0, "step": 1300 }, { "epoch": 0.6083916083916084, "grad_norm": 0.4723471427584443, "learning_rate": 4.9371990644869534e-05, "loss": 0.5364, "num_tokens": 341578590.0, "step": 1305 }, { "epoch": 0.6107226107226107, "grad_norm": 0.4745273455014607, "learning_rate": 4.936382569990837e-05, "loss": 0.5294, "num_tokens": 342889310.0, "step": 1310 }, { "epoch": 0.6130536130536131, "grad_norm": 0.4155339618600452, "learning_rate": 4.935560878073061e-05, "loss": 0.5167, "num_tokens": 344200030.0, "step": 1315 }, { "epoch": 0.6153846153846154, "grad_norm": 0.4782114553800321, "learning_rate": 4.934733990686934e-05, "loss": 0.5185, "num_tokens": 345504904.0, "step": 1320 }, { "epoch": 0.6177156177156177, "grad_norm": 0.44089991433891135, "learning_rate": 4.9339019097981155e-05, "loss": 0.5533, "num_tokens": 346815624.0, "step": 1325 }, { "epoch": 0.62004662004662, "grad_norm": 0.43436432584179596, "learning_rate": 4.933064637384611e-05, "loss": 0.5159, "num_tokens": 348126344.0, "step": 1330 }, { "epoch": 0.6223776223776224, "grad_norm": 0.43330361810730905, "learning_rate": 4.932222175436764e-05, "loss": 0.5162, "num_tokens": 349437064.0, "step": 1335 }, { "epoch": 0.6247086247086248, "grad_norm": 0.49741869953347956, "learning_rate": 4.9313745259572594e-05, "loss": 0.539, "num_tokens": 350734169.0, "step": 1340 }, { "epoch": 0.627039627039627, "grad_norm": 0.4766273272639524, "learning_rate": 4.93052169096111e-05, "loss": 0.5331, "num_tokens": 352031299.0, "step": 1345 }, { "epoch": 0.6293706293706294, "grad_norm": 0.4354355175881609, "learning_rate": 4.9296636724756576e-05, "loss": 0.5616, "num_tokens": 353342019.0, "step": 1350 }, { "epoch": 0.6317016317016317, "grad_norm": 0.4509143900260654, "learning_rate": 4.928800472540564e-05, "loss": 0.5162, "num_tokens": 354652739.0, "step": 1355 }, { "epoch": 0.634032634032634, "grad_norm": 0.48892330783121585, "learning_rate": 4.9279320932078114e-05, "loss": 0.5432, "num_tokens": 355956520.0, "step": 1360 }, { "epoch": 0.6363636363636364, "grad_norm": 0.49979045808989403, "learning_rate": 4.927058536541691e-05, "loss": 0.5421, "num_tokens": 357259308.0, "step": 1365 }, { "epoch": 0.6386946386946387, "grad_norm": 0.4773067120764309, "learning_rate": 4.926179804618805e-05, "loss": 0.5232, "num_tokens": 358570028.0, "step": 1370 }, { "epoch": 0.6410256410256411, "grad_norm": 0.4807159216924898, "learning_rate": 4.925295899528052e-05, "loss": 0.5378, "num_tokens": 359880748.0, "step": 1375 }, { "epoch": 0.6433566433566433, "grad_norm": 0.5792009053156909, "learning_rate": 4.924406823370637e-05, "loss": 0.5505, "num_tokens": 361191468.0, "step": 1380 }, { "epoch": 0.6456876456876457, "grad_norm": 0.43269843498677835, "learning_rate": 4.923512578260049e-05, "loss": 0.5271, "num_tokens": 362502188.0, "step": 1385 }, { "epoch": 0.6480186480186481, "grad_norm": 0.487074695118609, "learning_rate": 4.922613166322071e-05, "loss": 0.524, "num_tokens": 363812908.0, "step": 1390 }, { "epoch": 0.6503496503496503, "grad_norm": 0.4550175361090717, "learning_rate": 4.9217085896947636e-05, "loss": 0.5314, "num_tokens": 365123628.0, "step": 1395 }, { "epoch": 0.6526806526806527, "grad_norm": 0.47974397984295003, "learning_rate": 4.920798850528468e-05, "loss": 0.5467, "num_tokens": 366434348.0, "step": 1400 }, { "epoch": 0.655011655011655, "grad_norm": 0.48000475609552506, "learning_rate": 4.919883950985796e-05, "loss": 0.5284, "num_tokens": 367745068.0, "step": 1405 }, { "epoch": 0.6573426573426573, "grad_norm": 0.6241647775105474, "learning_rate": 4.918963893241628e-05, "loss": 0.5464, "num_tokens": 369055788.0, "step": 1410 }, { "epoch": 0.6596736596736597, "grad_norm": 0.48926526915624463, "learning_rate": 4.918038679483105e-05, "loss": 0.5331, "num_tokens": 370366508.0, "step": 1415 }, { "epoch": 0.662004662004662, "grad_norm": 0.4456502453067459, "learning_rate": 4.917108311909624e-05, "loss": 0.5525, "num_tokens": 371677228.0, "step": 1420 }, { "epoch": 0.6643356643356644, "grad_norm": 0.4346779313930937, "learning_rate": 4.916172792732838e-05, "loss": 0.5191, "num_tokens": 372987948.0, "step": 1425 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5028199232030454, "learning_rate": 4.91523212417664e-05, "loss": 0.5355, "num_tokens": 374298668.0, "step": 1430 }, { "epoch": 0.668997668997669, "grad_norm": 0.43127888280556725, "learning_rate": 4.914286308477168e-05, "loss": 0.5402, "num_tokens": 375597975.0, "step": 1435 }, { "epoch": 0.6713286713286714, "grad_norm": 0.42701713218021103, "learning_rate": 4.913335347882795e-05, "loss": 0.5436, "num_tokens": 376908695.0, "step": 1440 }, { "epoch": 0.6736596736596736, "grad_norm": 0.45471075660929255, "learning_rate": 4.912379244654125e-05, "loss": 0.5496, "num_tokens": 378219415.0, "step": 1445 }, { "epoch": 0.675990675990676, "grad_norm": 0.5036127947007575, "learning_rate": 4.911418001063985e-05, "loss": 0.5457, "num_tokens": 379526026.0, "step": 1450 }, { "epoch": 0.6783216783216783, "grad_norm": 0.6069326047714376, "learning_rate": 4.910451619397421e-05, "loss": 0.5532, "num_tokens": 380835617.0, "step": 1455 }, { "epoch": 0.6806526806526807, "grad_norm": 0.42021839267741845, "learning_rate": 4.9094801019516987e-05, "loss": 0.5302, "num_tokens": 382146337.0, "step": 1460 }, { "epoch": 0.682983682983683, "grad_norm": 0.42181592137530005, "learning_rate": 4.908503451036285e-05, "loss": 0.5395, "num_tokens": 383457057.0, "step": 1465 }, { "epoch": 0.6853146853146853, "grad_norm": 0.4819433448161508, "learning_rate": 4.9075216689728545e-05, "loss": 0.5232, "num_tokens": 384767777.0, "step": 1470 }, { "epoch": 0.6876456876456877, "grad_norm": 0.4360769334365848, "learning_rate": 4.9065347580952795e-05, "loss": 0.5419, "num_tokens": 386078497.0, "step": 1475 }, { "epoch": 0.6899766899766899, "grad_norm": 0.46623263345856697, "learning_rate": 4.9055427207496216e-05, "loss": 0.5327, "num_tokens": 387384301.0, "step": 1480 }, { "epoch": 0.6923076923076923, "grad_norm": 0.49755651823842795, "learning_rate": 4.9045455592941325e-05, "loss": 0.5313, "num_tokens": 388695021.0, "step": 1485 }, { "epoch": 0.6946386946386947, "grad_norm": 0.47799577567341445, "learning_rate": 4.903543276099241e-05, "loss": 0.5191, "num_tokens": 390005741.0, "step": 1490 }, { "epoch": 0.696969696969697, "grad_norm": 0.43014258037668707, "learning_rate": 4.902535873547555e-05, "loss": 0.5279, "num_tokens": 391300707.0, "step": 1495 }, { "epoch": 0.6993006993006993, "grad_norm": 0.43677160663867953, "learning_rate": 4.901523354033849e-05, "loss": 0.5239, "num_tokens": 392611427.0, "step": 1500 }, { "epoch": 0.7016317016317016, "grad_norm": 0.4314496324860873, "learning_rate": 4.9005057199650624e-05, "loss": 0.5507, "num_tokens": 393915317.0, "step": 1505 }, { "epoch": 0.703962703962704, "grad_norm": 0.5211544326381605, "learning_rate": 4.8994829737602945e-05, "loss": 0.5327, "num_tokens": 395213883.0, "step": 1510 }, { "epoch": 0.7062937062937062, "grad_norm": 0.4976803079849033, "learning_rate": 4.8984551178507936e-05, "loss": 0.5281, "num_tokens": 396524603.0, "step": 1515 }, { "epoch": 0.7086247086247086, "grad_norm": 0.48985310144346317, "learning_rate": 4.897422154679959e-05, "loss": 0.5285, "num_tokens": 397835323.0, "step": 1520 }, { "epoch": 0.710955710955711, "grad_norm": 0.5059108187782707, "learning_rate": 4.896384086703327e-05, "loss": 0.5221, "num_tokens": 399146043.0, "step": 1525 }, { "epoch": 0.7132867132867133, "grad_norm": 0.4641511467601387, "learning_rate": 4.8953409163885706e-05, "loss": 0.5263, "num_tokens": 400443842.0, "step": 1530 }, { "epoch": 0.7156177156177156, "grad_norm": 0.46382018249195767, "learning_rate": 4.894292646215492e-05, "loss": 0.5295, "num_tokens": 401754562.0, "step": 1535 }, { "epoch": 0.717948717948718, "grad_norm": 0.46293693643832434, "learning_rate": 4.8932392786760174e-05, "loss": 0.5311, "num_tokens": 403065282.0, "step": 1540 }, { "epoch": 0.7202797202797203, "grad_norm": 0.4416679999163852, "learning_rate": 4.8921808162741875e-05, "loss": 0.5316, "num_tokens": 404376002.0, "step": 1545 }, { "epoch": 0.7226107226107226, "grad_norm": 0.5431939837625993, "learning_rate": 4.891117261526159e-05, "loss": 0.5232, "num_tokens": 405686722.0, "step": 1550 }, { "epoch": 0.7249417249417249, "grad_norm": 0.41054749209427666, "learning_rate": 4.890048616960189e-05, "loss": 0.5272, "num_tokens": 406997442.0, "step": 1555 }, { "epoch": 0.7272727272727273, "grad_norm": 0.4379625629492077, "learning_rate": 4.888974885116637e-05, "loss": 0.5359, "num_tokens": 408298322.0, "step": 1560 }, { "epoch": 0.7296037296037297, "grad_norm": 0.48689004241017486, "learning_rate": 4.887896068547957e-05, "loss": 0.5469, "num_tokens": 409609042.0, "step": 1565 }, { "epoch": 0.7319347319347319, "grad_norm": 0.38565120780565504, "learning_rate": 4.886812169818686e-05, "loss": 0.5409, "num_tokens": 410917246.0, "step": 1570 }, { "epoch": 0.7342657342657343, "grad_norm": 0.42695613923544207, "learning_rate": 4.8857231915054465e-05, "loss": 0.5445, "num_tokens": 412227966.0, "step": 1575 }, { "epoch": 0.7365967365967366, "grad_norm": 0.381808559474973, "learning_rate": 4.884629136196934e-05, "loss": 0.5207, "num_tokens": 413538686.0, "step": 1580 }, { "epoch": 0.7389277389277389, "grad_norm": 0.44656354419841626, "learning_rate": 4.8835300064939126e-05, "loss": 0.5172, "num_tokens": 414849406.0, "step": 1585 }, { "epoch": 0.7412587412587412, "grad_norm": 0.4096295626157741, "learning_rate": 4.88242580500921e-05, "loss": 0.5255, "num_tokens": 416160126.0, "step": 1590 }, { "epoch": 0.7435897435897436, "grad_norm": 0.4147213001212058, "learning_rate": 4.8813165343677106e-05, "loss": 0.5426, "num_tokens": 417470846.0, "step": 1595 }, { "epoch": 0.745920745920746, "grad_norm": 0.5042968381378073, "learning_rate": 4.8802021972063496e-05, "loss": 0.5351, "num_tokens": 418781566.0, "step": 1600 }, { "epoch": 0.7482517482517482, "grad_norm": 0.4572941443823439, "learning_rate": 4.879082796174104e-05, "loss": 0.5267, "num_tokens": 420090396.0, "step": 1605 }, { "epoch": 0.7505827505827506, "grad_norm": 0.5253757414894156, "learning_rate": 4.87795833393199e-05, "loss": 0.5256, "num_tokens": 421401116.0, "step": 1610 }, { "epoch": 0.752913752913753, "grad_norm": 0.4314784400622872, "learning_rate": 4.876828813153055e-05, "loss": 0.52, "num_tokens": 422711836.0, "step": 1615 }, { "epoch": 0.7552447552447552, "grad_norm": 0.43464618943674793, "learning_rate": 4.875694236522372e-05, "loss": 0.5157, "num_tokens": 424003342.0, "step": 1620 }, { "epoch": 0.7575757575757576, "grad_norm": 0.43677420380768783, "learning_rate": 4.8745546067370326e-05, "loss": 0.5305, "num_tokens": 425314062.0, "step": 1625 }, { "epoch": 0.7599067599067599, "grad_norm": 0.4958407404835606, "learning_rate": 4.873409926506139e-05, "loss": 0.5362, "num_tokens": 426624782.0, "step": 1630 }, { "epoch": 0.7622377622377622, "grad_norm": 0.5242256465945466, "learning_rate": 4.8722601985508024e-05, "loss": 0.5369, "num_tokens": 427935502.0, "step": 1635 }, { "epoch": 0.7645687645687645, "grad_norm": 0.5675275645432154, "learning_rate": 4.871105425604129e-05, "loss": 0.5422, "num_tokens": 429246222.0, "step": 1640 }, { "epoch": 0.7668997668997669, "grad_norm": 0.5159660910808933, "learning_rate": 4.869945610411222e-05, "loss": 0.5379, "num_tokens": 430556942.0, "step": 1645 }, { "epoch": 0.7692307692307693, "grad_norm": 0.45981018020284387, "learning_rate": 4.8687807557291684e-05, "loss": 0.5233, "num_tokens": 431867662.0, "step": 1650 }, { "epoch": 0.7715617715617715, "grad_norm": 0.5178606410785168, "learning_rate": 4.867610864327035e-05, "loss": 0.517, "num_tokens": 433176429.0, "step": 1655 }, { "epoch": 0.7738927738927739, "grad_norm": 0.4153211549717526, "learning_rate": 4.866435938985864e-05, "loss": 0.521, "num_tokens": 434474764.0, "step": 1660 }, { "epoch": 0.7762237762237763, "grad_norm": 0.43474480248852926, "learning_rate": 4.8652559824986614e-05, "loss": 0.5149, "num_tokens": 435785484.0, "step": 1665 }, { "epoch": 0.7785547785547785, "grad_norm": 0.40937995874484956, "learning_rate": 4.8640709976703955e-05, "loss": 0.5255, "num_tokens": 437096204.0, "step": 1670 }, { "epoch": 0.7808857808857809, "grad_norm": 0.4465967325967671, "learning_rate": 4.862880987317987e-05, "loss": 0.5322, "num_tokens": 438406924.0, "step": 1675 }, { "epoch": 0.7832167832167832, "grad_norm": 0.49435400982888844, "learning_rate": 4.8616859542703015e-05, "loss": 0.5139, "num_tokens": 439717644.0, "step": 1680 }, { "epoch": 0.7855477855477856, "grad_norm": 0.45230753156297615, "learning_rate": 4.860485901368146e-05, "loss": 0.5204, "num_tokens": 441012029.0, "step": 1685 }, { "epoch": 0.7878787878787878, "grad_norm": 0.46302825612412485, "learning_rate": 4.859280831464262e-05, "loss": 0.5307, "num_tokens": 442322749.0, "step": 1690 }, { "epoch": 0.7902097902097902, "grad_norm": 0.4352069645722489, "learning_rate": 4.858070747423315e-05, "loss": 0.5293, "num_tokens": 443633469.0, "step": 1695 }, { "epoch": 0.7925407925407926, "grad_norm": 0.44776105300683866, "learning_rate": 4.856855652121889e-05, "loss": 0.5376, "num_tokens": 444944189.0, "step": 1700 }, { "epoch": 0.7948717948717948, "grad_norm": 0.5047765930651309, "learning_rate": 4.855635548448485e-05, "loss": 0.5266, "num_tokens": 446254909.0, "step": 1705 }, { "epoch": 0.7972027972027972, "grad_norm": 0.5902940318374286, "learning_rate": 4.8544104393035064e-05, "loss": 0.5548, "num_tokens": 447549445.0, "step": 1710 }, { "epoch": 0.7995337995337995, "grad_norm": 0.47765982040228566, "learning_rate": 4.8531803275992564e-05, "loss": 0.5234, "num_tokens": 448860165.0, "step": 1715 }, { "epoch": 0.8018648018648019, "grad_norm": 0.4683409566871783, "learning_rate": 4.85194521625993e-05, "loss": 0.5303, "num_tokens": 450170885.0, "step": 1720 }, { "epoch": 0.8041958041958042, "grad_norm": 0.3875193157237528, "learning_rate": 4.850705108221607e-05, "loss": 0.5243, "num_tokens": 451481605.0, "step": 1725 }, { "epoch": 0.8065268065268065, "grad_norm": 0.4090714505669396, "learning_rate": 4.849460006432246e-05, "loss": 0.5368, "num_tokens": 452792325.0, "step": 1730 }, { "epoch": 0.8088578088578089, "grad_norm": 0.41342814545426965, "learning_rate": 4.848209913851676e-05, "loss": 0.5367, "num_tokens": 454103045.0, "step": 1735 }, { "epoch": 0.8111888111888111, "grad_norm": 0.43002723270421467, "learning_rate": 4.8469548334515895e-05, "loss": 0.5128, "num_tokens": 455413765.0, "step": 1740 }, { "epoch": 0.8135198135198135, "grad_norm": 0.44536700540815066, "learning_rate": 4.845694768215538e-05, "loss": 0.5225, "num_tokens": 456724485.0, "step": 1745 }, { "epoch": 0.8158508158508159, "grad_norm": 0.41417616843988325, "learning_rate": 4.844429721138921e-05, "loss": 0.5179, "num_tokens": 458035205.0, "step": 1750 }, { "epoch": 0.8181818181818182, "grad_norm": 0.4335240853933784, "learning_rate": 4.843159695228981e-05, "loss": 0.5338, "num_tokens": 459345925.0, "step": 1755 }, { "epoch": 0.8205128205128205, "grad_norm": 0.4804531520417236, "learning_rate": 4.841884693504796e-05, "loss": 0.5301, "num_tokens": 460656645.0, "step": 1760 }, { "epoch": 0.8228438228438228, "grad_norm": 0.426207694644574, "learning_rate": 4.8406047189972745e-05, "loss": 0.512, "num_tokens": 461967365.0, "step": 1765 }, { "epoch": 0.8251748251748252, "grad_norm": 0.5170006552608741, "learning_rate": 4.839319774749142e-05, "loss": 0.5439, "num_tokens": 463278085.0, "step": 1770 }, { "epoch": 0.8275058275058275, "grad_norm": 0.41296058440902, "learning_rate": 4.8380298638149414e-05, "loss": 0.529, "num_tokens": 464588805.0, "step": 1775 }, { "epoch": 0.8298368298368298, "grad_norm": 0.4432327702578238, "learning_rate": 4.8367349892610205e-05, "loss": 0.5141, "num_tokens": 465899525.0, "step": 1780 }, { "epoch": 0.8321678321678322, "grad_norm": 0.4186473579130964, "learning_rate": 4.8354351541655295e-05, "loss": 0.5056, "num_tokens": 467210245.0, "step": 1785 }, { "epoch": 0.8344988344988346, "grad_norm": 0.4085487401716384, "learning_rate": 4.834130361618407e-05, "loss": 0.5201, "num_tokens": 468520965.0, "step": 1790 }, { "epoch": 0.8368298368298368, "grad_norm": 0.41696496844476233, "learning_rate": 4.832820614721377e-05, "loss": 0.5182, "num_tokens": 469831685.0, "step": 1795 }, { "epoch": 0.8391608391608392, "grad_norm": 0.49106045483752714, "learning_rate": 4.8315059165879424e-05, "loss": 0.5053, "num_tokens": 471142405.0, "step": 1800 }, { "epoch": 0.8414918414918415, "grad_norm": 0.48437176577007535, "learning_rate": 4.830186270343375e-05, "loss": 0.5168, "num_tokens": 472453125.0, "step": 1805 }, { "epoch": 0.8438228438228438, "grad_norm": 0.40932065591417777, "learning_rate": 4.828861679124711e-05, "loss": 0.5381, "num_tokens": 473763845.0, "step": 1810 }, { "epoch": 0.8461538461538461, "grad_norm": 0.5194673863667205, "learning_rate": 4.827532146080738e-05, "loss": 0.5299, "num_tokens": 475074565.0, "step": 1815 }, { "epoch": 0.8484848484848485, "grad_norm": 0.41189276085541665, "learning_rate": 4.826197674371995e-05, "loss": 0.5107, "num_tokens": 476385285.0, "step": 1820 }, { "epoch": 0.8508158508158508, "grad_norm": 0.41512648167613064, "learning_rate": 4.8248582671707585e-05, "loss": 0.5182, "num_tokens": 477684249.0, "step": 1825 }, { "epoch": 0.8531468531468531, "grad_norm": 0.39429940494764987, "learning_rate": 4.8235139276610395e-05, "loss": 0.527, "num_tokens": 478994969.0, "step": 1830 }, { "epoch": 0.8554778554778555, "grad_norm": 0.4269707121617, "learning_rate": 4.8221646590385723e-05, "loss": 0.5202, "num_tokens": 480305689.0, "step": 1835 }, { "epoch": 0.8578088578088578, "grad_norm": 0.4747259759226198, "learning_rate": 4.8208104645108086e-05, "loss": 0.5163, "num_tokens": 481616409.0, "step": 1840 }, { "epoch": 0.8601398601398601, "grad_norm": 0.4214636443982996, "learning_rate": 4.819451347296912e-05, "loss": 0.5202, "num_tokens": 482927129.0, "step": 1845 }, { "epoch": 0.8624708624708625, "grad_norm": 0.4097076386562582, "learning_rate": 4.818087310627746e-05, "loss": 0.5198, "num_tokens": 484237849.0, "step": 1850 }, { "epoch": 0.8648018648018648, "grad_norm": 0.4415899349026412, "learning_rate": 4.816718357745869e-05, "loss": 0.5116, "num_tokens": 485544074.0, "step": 1855 }, { "epoch": 0.8671328671328671, "grad_norm": 0.49520322041534987, "learning_rate": 4.815344491905527e-05, "loss": 0.5268, "num_tokens": 486854794.0, "step": 1860 }, { "epoch": 0.8694638694638694, "grad_norm": 0.4013730407728088, "learning_rate": 4.813965716372644e-05, "loss": 0.5357, "num_tokens": 488165514.0, "step": 1865 }, { "epoch": 0.8717948717948718, "grad_norm": 0.3971657654693432, "learning_rate": 4.812582034424815e-05, "loss": 0.5036, "num_tokens": 489476234.0, "step": 1870 }, { "epoch": 0.8741258741258742, "grad_norm": 0.4546086981413901, "learning_rate": 4.811193449351301e-05, "loss": 0.5185, "num_tokens": 490786954.0, "step": 1875 }, { "epoch": 0.8764568764568764, "grad_norm": 0.4514734870231076, "learning_rate": 4.809799964453014e-05, "loss": 0.5285, "num_tokens": 492097674.0, "step": 1880 }, { "epoch": 0.8787878787878788, "grad_norm": 0.486652363082405, "learning_rate": 4.808401583042517e-05, "loss": 0.5214, "num_tokens": 493408394.0, "step": 1885 }, { "epoch": 0.8811188811188811, "grad_norm": 0.38657514612617594, "learning_rate": 4.806998308444014e-05, "loss": 0.5285, "num_tokens": 494719114.0, "step": 1890 }, { "epoch": 0.8834498834498834, "grad_norm": 0.43457520597462723, "learning_rate": 4.805590143993337e-05, "loss": 0.5283, "num_tokens": 496018186.0, "step": 1895 }, { "epoch": 0.8857808857808858, "grad_norm": 0.45512971801887536, "learning_rate": 4.804177093037947e-05, "loss": 0.5167, "num_tokens": 497312162.0, "step": 1900 }, { "epoch": 0.8881118881118881, "grad_norm": 0.5864459417857114, "learning_rate": 4.802759158936914e-05, "loss": 0.507, "num_tokens": 498622882.0, "step": 1905 }, { "epoch": 0.8904428904428905, "grad_norm": 0.4119135091650743, "learning_rate": 4.801336345060925e-05, "loss": 0.5075, "num_tokens": 499933602.0, "step": 1910 }, { "epoch": 0.8927738927738927, "grad_norm": 0.5607982382740162, "learning_rate": 4.79990865479226e-05, "loss": 0.5317, "num_tokens": 501244322.0, "step": 1915 }, { "epoch": 0.8951048951048951, "grad_norm": 0.4424330844089448, "learning_rate": 4.7984760915247945e-05, "loss": 0.5024, "num_tokens": 502555042.0, "step": 1920 }, { "epoch": 0.8974358974358975, "grad_norm": 0.4047970520184837, "learning_rate": 4.7970386586639867e-05, "loss": 0.4966, "num_tokens": 503865762.0, "step": 1925 }, { "epoch": 0.8997668997668997, "grad_norm": 0.44771407799642704, "learning_rate": 4.795596359626871e-05, "loss": 0.5236, "num_tokens": 505176482.0, "step": 1930 }, { "epoch": 0.9020979020979021, "grad_norm": 0.5048948938413714, "learning_rate": 4.794149197842051e-05, "loss": 0.5179, "num_tokens": 506479186.0, "step": 1935 }, { "epoch": 0.9044289044289044, "grad_norm": 0.45102633005957127, "learning_rate": 4.792697176749686e-05, "loss": 0.5329, "num_tokens": 507789906.0, "step": 1940 }, { "epoch": 0.9067599067599068, "grad_norm": 0.47896436686520744, "learning_rate": 4.791240299801492e-05, "loss": 0.5144, "num_tokens": 509100626.0, "step": 1945 }, { "epoch": 0.9090909090909091, "grad_norm": 0.44024138392686507, "learning_rate": 4.7897785704607244e-05, "loss": 0.5319, "num_tokens": 510411346.0, "step": 1950 }, { "epoch": 0.9114219114219114, "grad_norm": 0.46492535601527907, "learning_rate": 4.7883119922021744e-05, "loss": 0.5005, "num_tokens": 511720801.0, "step": 1955 }, { "epoch": 0.9137529137529138, "grad_norm": 0.47305357877273235, "learning_rate": 4.7868405685121614e-05, "loss": 0.5058, "num_tokens": 513031521.0, "step": 1960 }, { "epoch": 0.916083916083916, "grad_norm": 0.5349540175928681, "learning_rate": 4.7853643028885216e-05, "loss": 0.5259, "num_tokens": 514342241.0, "step": 1965 }, { "epoch": 0.9184149184149184, "grad_norm": 0.4744283556978438, "learning_rate": 4.783883198840601e-05, "loss": 0.5247, "num_tokens": 515652961.0, "step": 1970 }, { "epoch": 0.9207459207459208, "grad_norm": 0.4233785441775157, "learning_rate": 4.78239725988925e-05, "loss": 0.5229, "num_tokens": 516963681.0, "step": 1975 }, { "epoch": 0.9230769230769231, "grad_norm": 0.48524455188818355, "learning_rate": 4.78090648956681e-05, "loss": 0.5176, "num_tokens": 518274401.0, "step": 1980 }, { "epoch": 0.9254079254079254, "grad_norm": 0.4955971872107986, "learning_rate": 4.779410891417107e-05, "loss": 0.517, "num_tokens": 519585121.0, "step": 1985 }, { "epoch": 0.9277389277389277, "grad_norm": 0.45326272641602827, "learning_rate": 4.777910468995447e-05, "loss": 0.525, "num_tokens": 520895841.0, "step": 1990 }, { "epoch": 0.9300699300699301, "grad_norm": 0.5154625413913212, "learning_rate": 4.7764052258686e-05, "loss": 0.5155, "num_tokens": 522206561.0, "step": 1995 }, { "epoch": 0.9324009324009324, "grad_norm": 0.5173600774002117, "learning_rate": 4.774895165614799e-05, "loss": 0.5368, "num_tokens": 523517281.0, "step": 2000 }, { "epoch": 0.9347319347319347, "grad_norm": 0.4394182635279032, "learning_rate": 4.773380291823726e-05, "loss": 0.5112, "num_tokens": 524828001.0, "step": 2005 }, { "epoch": 0.9370629370629371, "grad_norm": 0.46211117414103975, "learning_rate": 4.7718606080965064e-05, "loss": 0.5176, "num_tokens": 526138721.0, "step": 2010 }, { "epoch": 0.9393939393939394, "grad_norm": 0.44369729323375096, "learning_rate": 4.770336118045701e-05, "loss": 0.5202, "num_tokens": 527449441.0, "step": 2015 }, { "epoch": 0.9417249417249417, "grad_norm": 0.46786194576051876, "learning_rate": 4.768806825295292e-05, "loss": 0.5435, "num_tokens": 528760161.0, "step": 2020 }, { "epoch": 0.9440559440559441, "grad_norm": 0.45716691252844754, "learning_rate": 4.7672727334806844e-05, "loss": 0.5217, "num_tokens": 530065986.0, "step": 2025 }, { "epoch": 0.9463869463869464, "grad_norm": 0.4716782323029019, "learning_rate": 4.765733846248685e-05, "loss": 0.5093, "num_tokens": 531376706.0, "step": 2030 }, { "epoch": 0.9487179487179487, "grad_norm": 0.4809813652755708, "learning_rate": 4.764190167257508e-05, "loss": 0.5222, "num_tokens": 532687426.0, "step": 2035 }, { "epoch": 0.951048951048951, "grad_norm": 0.42126121038875564, "learning_rate": 4.7626417001767495e-05, "loss": 0.5105, "num_tokens": 533998146.0, "step": 2040 }, { "epoch": 0.9533799533799534, "grad_norm": 0.41522513599681043, "learning_rate": 4.7610884486873947e-05, "loss": 0.5056, "num_tokens": 535308866.0, "step": 2045 }, { "epoch": 0.9557109557109557, "grad_norm": 0.4622274108637101, "learning_rate": 4.759530416481798e-05, "loss": 0.5275, "num_tokens": 536619586.0, "step": 2050 }, { "epoch": 0.958041958041958, "grad_norm": 0.411035345439206, "learning_rate": 4.757967607263681e-05, "loss": 0.5172, "num_tokens": 537916961.0, "step": 2055 }, { "epoch": 0.9603729603729604, "grad_norm": 0.41117700376194166, "learning_rate": 4.756400024748121e-05, "loss": 0.5129, "num_tokens": 539227681.0, "step": 2060 }, { "epoch": 0.9627039627039627, "grad_norm": 0.4009672829244564, "learning_rate": 4.75482767266154e-05, "loss": 0.5233, "num_tokens": 540538401.0, "step": 2065 }, { "epoch": 0.965034965034965, "grad_norm": 0.38122118693492923, "learning_rate": 4.7532505547417e-05, "loss": 0.5142, "num_tokens": 541849121.0, "step": 2070 }, { "epoch": 0.9673659673659674, "grad_norm": 0.45914187153924546, "learning_rate": 4.7516686747376926e-05, "loss": 0.5085, "num_tokens": 543159841.0, "step": 2075 }, { "epoch": 0.9696969696969697, "grad_norm": 0.43269563864820926, "learning_rate": 4.7500820364099287e-05, "loss": 0.5108, "num_tokens": 544470561.0, "step": 2080 }, { "epoch": 0.972027972027972, "grad_norm": 0.5180648923614879, "learning_rate": 4.74849064353013e-05, "loss": 0.5101, "num_tokens": 545781281.0, "step": 2085 }, { "epoch": 0.9743589743589743, "grad_norm": 0.4167054274754852, "learning_rate": 4.746894499881322e-05, "loss": 0.5058, "num_tokens": 547092001.0, "step": 2090 }, { "epoch": 0.9766899766899767, "grad_norm": 0.4186229461075277, "learning_rate": 4.745293609257822e-05, "loss": 0.5063, "num_tokens": 548394350.0, "step": 2095 }, { "epoch": 0.9790209790209791, "grad_norm": 0.45822668262911204, "learning_rate": 4.7436879754652345e-05, "loss": 0.5252, "num_tokens": 549691697.0, "step": 2100 }, { "epoch": 0.9813519813519813, "grad_norm": 0.3764583775129133, "learning_rate": 4.742077602320437e-05, "loss": 0.5007, "num_tokens": 551002417.0, "step": 2105 }, { "epoch": 0.9836829836829837, "grad_norm": 0.39341524513084175, "learning_rate": 4.7404624936515746e-05, "loss": 0.5171, "num_tokens": 552313137.0, "step": 2110 }, { "epoch": 0.986013986013986, "grad_norm": 0.4199783806277457, "learning_rate": 4.738842653298048e-05, "loss": 0.5069, "num_tokens": 553623857.0, "step": 2115 }, { "epoch": 0.9883449883449883, "grad_norm": 0.4951470880188904, "learning_rate": 4.737218085110506e-05, "loss": 0.5139, "num_tokens": 554934577.0, "step": 2120 }, { "epoch": 0.9906759906759907, "grad_norm": 0.4660659693883058, "learning_rate": 4.73558879295084e-05, "loss": 0.5158, "num_tokens": 556245297.0, "step": 2125 }, { "epoch": 0.993006993006993, "grad_norm": 0.4274088119401731, "learning_rate": 4.733954780692165e-05, "loss": 0.5086, "num_tokens": 557556017.0, "step": 2130 }, { "epoch": 0.9953379953379954, "grad_norm": 0.3888459894818999, "learning_rate": 4.732316052218822e-05, "loss": 0.5214, "num_tokens": 558866737.0, "step": 2135 }, { "epoch": 0.9976689976689976, "grad_norm": 0.46590499158532944, "learning_rate": 4.730672611426361e-05, "loss": 0.4982, "num_tokens": 560177457.0, "step": 2140 }, { "epoch": 1.0, "grad_norm": 0.40795631542247435, "learning_rate": 4.729024462221533e-05, "loss": 0.5045, "num_tokens": 561488177.0, "step": 2145 }, { "epoch": 1.0023310023310024, "grad_norm": 0.4709448881516708, "learning_rate": 4.727371608522284e-05, "loss": 0.4741, "num_tokens": 562798897.0, "step": 2150 }, { "epoch": 1.0046620046620047, "grad_norm": 0.39954760886818896, "learning_rate": 4.725714054257742e-05, "loss": 0.4879, "num_tokens": 564109617.0, "step": 2155 }, { "epoch": 1.006993006993007, "grad_norm": 0.48984368311093457, "learning_rate": 4.724051803368209e-05, "loss": 0.4857, "num_tokens": 565420337.0, "step": 2160 }, { "epoch": 1.0093240093240092, "grad_norm": 0.46609742273249777, "learning_rate": 4.7223848598051514e-05, "loss": 0.4796, "num_tokens": 566731057.0, "step": 2165 }, { "epoch": 1.0116550116550116, "grad_norm": 0.36935474096318466, "learning_rate": 4.720713227531193e-05, "loss": 0.4696, "num_tokens": 568041777.0, "step": 2170 }, { "epoch": 1.013986013986014, "grad_norm": 0.3580596575113959, "learning_rate": 4.719036910520102e-05, "loss": 0.4624, "num_tokens": 569352497.0, "step": 2175 }, { "epoch": 1.0163170163170163, "grad_norm": 0.39328253702466387, "learning_rate": 4.717355912756783e-05, "loss": 0.4874, "num_tokens": 570663217.0, "step": 2180 }, { "epoch": 1.0186480186480187, "grad_norm": 0.43854519615893484, "learning_rate": 4.715670238237267e-05, "loss": 0.4921, "num_tokens": 571946596.0, "step": 2185 }, { "epoch": 1.020979020979021, "grad_norm": 0.4320886782336019, "learning_rate": 4.713979890968704e-05, "loss": 0.4726, "num_tokens": 573254295.0, "step": 2190 }, { "epoch": 1.0233100233100234, "grad_norm": 0.417052323523323, "learning_rate": 4.712284874969351e-05, "loss": 0.4761, "num_tokens": 574555927.0, "step": 2195 }, { "epoch": 1.0256410256410255, "grad_norm": 0.39493641603163543, "learning_rate": 4.710585194268564e-05, "loss": 0.4708, "num_tokens": 575866647.0, "step": 2200 }, { "epoch": 1.027972027972028, "grad_norm": 0.43644055780187835, "learning_rate": 4.708880852906786e-05, "loss": 0.4811, "num_tokens": 577177367.0, "step": 2205 }, { "epoch": 1.0303030303030303, "grad_norm": 0.44168949675712726, "learning_rate": 4.707171854935542e-05, "loss": 0.487, "num_tokens": 578488087.0, "step": 2210 }, { "epoch": 1.0326340326340326, "grad_norm": 0.4723407127116691, "learning_rate": 4.705458204417426e-05, "loss": 0.4752, "num_tokens": 579798807.0, "step": 2215 }, { "epoch": 1.034965034965035, "grad_norm": 0.5001904118319063, "learning_rate": 4.703739905426089e-05, "loss": 0.4641, "num_tokens": 581109527.0, "step": 2220 }, { "epoch": 1.0372960372960374, "grad_norm": 0.41134533047549643, "learning_rate": 4.7020169620462363e-05, "loss": 0.4888, "num_tokens": 582420247.0, "step": 2225 }, { "epoch": 1.0396270396270397, "grad_norm": 0.4299285336564839, "learning_rate": 4.7002893783736104e-05, "loss": 0.4663, "num_tokens": 583730967.0, "step": 2230 }, { "epoch": 1.0419580419580419, "grad_norm": 0.39597875355379963, "learning_rate": 4.6985571585149876e-05, "loss": 0.4913, "num_tokens": 585031618.0, "step": 2235 }, { "epoch": 1.0442890442890442, "grad_norm": 0.5200351424675955, "learning_rate": 4.696820306588162e-05, "loss": 0.4696, "num_tokens": 586342338.0, "step": 2240 }, { "epoch": 1.0466200466200466, "grad_norm": 0.45179809404395227, "learning_rate": 4.6950788267219425e-05, "loss": 0.479, "num_tokens": 587653058.0, "step": 2245 }, { "epoch": 1.048951048951049, "grad_norm": 0.8707060578758659, "learning_rate": 4.6933327230561366e-05, "loss": 0.4666, "num_tokens": 588963778.0, "step": 2250 }, { "epoch": 1.0512820512820513, "grad_norm": 0.4615963497950469, "learning_rate": 4.691581999741544e-05, "loss": 0.477, "num_tokens": 590274498.0, "step": 2255 }, { "epoch": 1.0536130536130537, "grad_norm": 0.44042568334028054, "learning_rate": 4.689826660939947e-05, "loss": 0.4835, "num_tokens": 591579372.0, "step": 2260 }, { "epoch": 1.055944055944056, "grad_norm": 0.4171686514533797, "learning_rate": 4.6880667108241e-05, "loss": 0.4755, "num_tokens": 592882201.0, "step": 2265 }, { "epoch": 1.0582750582750582, "grad_norm": 0.3747338047053368, "learning_rate": 4.686302153577717e-05, "loss": 0.4797, "num_tokens": 594192921.0, "step": 2270 }, { "epoch": 1.0606060606060606, "grad_norm": 0.45499821388190786, "learning_rate": 4.6845329933954685e-05, "loss": 0.4933, "num_tokens": 595488293.0, "step": 2275 }, { "epoch": 1.062937062937063, "grad_norm": 0.4457910657080489, "learning_rate": 4.682759234482961e-05, "loss": 0.4812, "num_tokens": 596799013.0, "step": 2280 }, { "epoch": 1.0652680652680653, "grad_norm": 0.48446830788970874, "learning_rate": 4.680980881056736e-05, "loss": 0.4807, "num_tokens": 598097884.0, "step": 2285 }, { "epoch": 1.0675990675990676, "grad_norm": 0.39552772687416615, "learning_rate": 4.6791979373442594e-05, "loss": 0.4788, "num_tokens": 599408604.0, "step": 2290 }, { "epoch": 1.06993006993007, "grad_norm": 0.4456624889263455, "learning_rate": 4.6774104075839055e-05, "loss": 0.4652, "num_tokens": 600719324.0, "step": 2295 }, { "epoch": 1.0722610722610724, "grad_norm": 0.4907163642978358, "learning_rate": 4.6756182960249514e-05, "loss": 0.4881, "num_tokens": 602030044.0, "step": 2300 }, { "epoch": 1.0745920745920745, "grad_norm": 0.4552923449742935, "learning_rate": 4.6738216069275656e-05, "loss": 0.4767, "num_tokens": 603340764.0, "step": 2305 }, { "epoch": 1.0769230769230769, "grad_norm": 0.42327123031082714, "learning_rate": 4.6720203445628006e-05, "loss": 0.4698, "num_tokens": 604651484.0, "step": 2310 }, { "epoch": 1.0792540792540792, "grad_norm": 0.42295921506423456, "learning_rate": 4.6702145132125774e-05, "loss": 0.4814, "num_tokens": 605950791.0, "step": 2315 }, { "epoch": 1.0815850815850816, "grad_norm": 0.4362001736140205, "learning_rate": 4.668404117169679e-05, "loss": 0.4859, "num_tokens": 607261511.0, "step": 2320 }, { "epoch": 1.083916083916084, "grad_norm": 0.404472977066644, "learning_rate": 4.6665891607377415e-05, "loss": 0.4841, "num_tokens": 608572231.0, "step": 2325 }, { "epoch": 1.0862470862470863, "grad_norm": 0.4154110483833513, "learning_rate": 4.664769648231239e-05, "loss": 0.4737, "num_tokens": 609882951.0, "step": 2330 }, { "epoch": 1.0885780885780885, "grad_norm": 0.4046042178116509, "learning_rate": 4.662945583975478e-05, "loss": 0.4874, "num_tokens": 611193671.0, "step": 2335 }, { "epoch": 1.0909090909090908, "grad_norm": 0.6093115629231195, "learning_rate": 4.6611169723065854e-05, "loss": 0.4522, "num_tokens": 612504391.0, "step": 2340 }, { "epoch": 1.0932400932400932, "grad_norm": 0.5011935199868325, "learning_rate": 4.659283817571496e-05, "loss": 0.4816, "num_tokens": 613815111.0, "step": 2345 }, { "epoch": 1.0955710955710956, "grad_norm": 0.4169395778521336, "learning_rate": 4.657446124127948e-05, "loss": 0.4807, "num_tokens": 615125831.0, "step": 2350 }, { "epoch": 1.097902097902098, "grad_norm": 0.41261114151322104, "learning_rate": 4.655603896344465e-05, "loss": 0.4881, "num_tokens": 616436551.0, "step": 2355 }, { "epoch": 1.1002331002331003, "grad_norm": 0.4380543995664826, "learning_rate": 4.653757138600352e-05, "loss": 0.4654, "num_tokens": 617747271.0, "step": 2360 }, { "epoch": 1.1025641025641026, "grad_norm": 0.4436608684626528, "learning_rate": 4.651905855285682e-05, "loss": 0.4568, "num_tokens": 619057991.0, "step": 2365 }, { "epoch": 1.104895104895105, "grad_norm": 0.41612907159233714, "learning_rate": 4.650050050801285e-05, "loss": 0.479, "num_tokens": 620368711.0, "step": 2370 }, { "epoch": 1.1072261072261071, "grad_norm": 0.4396241255197849, "learning_rate": 4.64818972955874e-05, "loss": 0.4708, "num_tokens": 621679431.0, "step": 2375 }, { "epoch": 1.1095571095571095, "grad_norm": 0.4231567577088472, "learning_rate": 4.646324895980363e-05, "loss": 0.476, "num_tokens": 622990151.0, "step": 2380 }, { "epoch": 1.1118881118881119, "grad_norm": 0.37510083417666035, "learning_rate": 4.6444555544991965e-05, "loss": 0.4699, "num_tokens": 624300871.0, "step": 2385 }, { "epoch": 1.1142191142191142, "grad_norm": 0.41354134914061674, "learning_rate": 4.642581709558998e-05, "loss": 0.4922, "num_tokens": 625611591.0, "step": 2390 }, { "epoch": 1.1165501165501166, "grad_norm": 0.4506054239813869, "learning_rate": 4.640703365614233e-05, "loss": 0.4777, "num_tokens": 626922311.0, "step": 2395 }, { "epoch": 1.118881118881119, "grad_norm": 0.49541627026643736, "learning_rate": 4.6388205271300585e-05, "loss": 0.4784, "num_tokens": 628220877.0, "step": 2400 }, { "epoch": 1.121212121212121, "grad_norm": 0.4457913773922344, "learning_rate": 4.636933198582319e-05, "loss": 0.4847, "num_tokens": 629531597.0, "step": 2405 }, { "epoch": 1.1235431235431235, "grad_norm": 0.4403563514498699, "learning_rate": 4.63504138445753e-05, "loss": 0.4913, "num_tokens": 630842317.0, "step": 2410 }, { "epoch": 1.1258741258741258, "grad_norm": 0.5039870137615442, "learning_rate": 4.6331450892528725e-05, "loss": 0.4767, "num_tokens": 632153037.0, "step": 2415 }, { "epoch": 1.1282051282051282, "grad_norm": 0.43430596728961307, "learning_rate": 4.631244317476179e-05, "loss": 0.4818, "num_tokens": 633463757.0, "step": 2420 }, { "epoch": 1.1305361305361306, "grad_norm": 0.4943774670951527, "learning_rate": 4.6293390736459226e-05, "loss": 0.4692, "num_tokens": 634774477.0, "step": 2425 }, { "epoch": 1.132867132867133, "grad_norm": 0.45784553779974424, "learning_rate": 4.627429362291208e-05, "loss": 0.4787, "num_tokens": 636085197.0, "step": 2430 }, { "epoch": 1.1351981351981353, "grad_norm": 0.47029067075996683, "learning_rate": 4.62551518795176e-05, "loss": 0.4722, "num_tokens": 637395917.0, "step": 2435 }, { "epoch": 1.1375291375291376, "grad_norm": 0.444979370543174, "learning_rate": 4.623596555177913e-05, "loss": 0.4613, "num_tokens": 638706637.0, "step": 2440 }, { "epoch": 1.1398601398601398, "grad_norm": 0.3767342879009302, "learning_rate": 4.621673468530599e-05, "loss": 0.4723, "num_tokens": 640017357.0, "step": 2445 }, { "epoch": 1.1421911421911422, "grad_norm": 0.4436410093654824, "learning_rate": 4.6197459325813406e-05, "loss": 0.4814, "num_tokens": 641328077.0, "step": 2450 }, { "epoch": 1.1445221445221445, "grad_norm": 0.45322324569299677, "learning_rate": 4.617813951912231e-05, "loss": 0.4648, "num_tokens": 642637532.0, "step": 2455 }, { "epoch": 1.1468531468531469, "grad_norm": 0.3889321859528221, "learning_rate": 4.6158775311159357e-05, "loss": 0.4776, "num_tokens": 643948252.0, "step": 2460 }, { "epoch": 1.1491841491841492, "grad_norm": 0.38162917158886384, "learning_rate": 4.613936674795672e-05, "loss": 0.4886, "num_tokens": 645258972.0, "step": 2465 }, { "epoch": 1.1515151515151516, "grad_norm": 0.4103235852443348, "learning_rate": 4.611991387565202e-05, "loss": 0.4854, "num_tokens": 646569692.0, "step": 2470 }, { "epoch": 1.1538461538461537, "grad_norm": 0.4463003624553352, "learning_rate": 4.6100416740488204e-05, "loss": 0.4682, "num_tokens": 647864905.0, "step": 2475 }, { "epoch": 1.156177156177156, "grad_norm": 1.1574168466987094, "learning_rate": 4.608087538881344e-05, "loss": 0.4912, "num_tokens": 649175625.0, "step": 2480 }, { "epoch": 1.1585081585081585, "grad_norm": 0.3575350970980845, "learning_rate": 4.606128986708101e-05, "loss": 0.4725, "num_tokens": 650486345.0, "step": 2485 }, { "epoch": 1.1608391608391608, "grad_norm": 0.4698297516737484, "learning_rate": 4.604166022184921e-05, "loss": 0.4818, "num_tokens": 651772267.0, "step": 2490 }, { "epoch": 1.1631701631701632, "grad_norm": 0.4518636492405959, "learning_rate": 4.602198649978119e-05, "loss": 0.4823, "num_tokens": 653082987.0, "step": 2495 }, { "epoch": 1.1655011655011656, "grad_norm": 0.4108073804989677, "learning_rate": 4.600226874764491e-05, "loss": 0.4718, "num_tokens": 654390572.0, "step": 2500 }, { "epoch": 1.167832167832168, "grad_norm": 0.42730405936868865, "learning_rate": 4.598250701231299e-05, "loss": 0.4621, "num_tokens": 655701292.0, "step": 2505 }, { "epoch": 1.1701631701631703, "grad_norm": 0.4294330624184933, "learning_rate": 4.596270134076259e-05, "loss": 0.4773, "num_tokens": 657012012.0, "step": 2510 }, { "epoch": 1.1724941724941724, "grad_norm": 0.4229790807053798, "learning_rate": 4.594285178007534e-05, "loss": 0.4889, "num_tokens": 658322732.0, "step": 2515 }, { "epoch": 1.1748251748251748, "grad_norm": 0.3704279315350088, "learning_rate": 4.592295837743719e-05, "loss": 0.4645, "num_tokens": 659633452.0, "step": 2520 }, { "epoch": 1.1771561771561772, "grad_norm": 0.4283412404900056, "learning_rate": 4.590302118013829e-05, "loss": 0.4722, "num_tokens": 660944172.0, "step": 2525 }, { "epoch": 1.1794871794871795, "grad_norm": 0.44943350837379137, "learning_rate": 4.588304023557293e-05, "loss": 0.5052, "num_tokens": 662254892.0, "step": 2530 }, { "epoch": 1.1818181818181819, "grad_norm": 0.3908539181988753, "learning_rate": 4.586301559123939e-05, "loss": 0.4688, "num_tokens": 663565612.0, "step": 2535 }, { "epoch": 1.1841491841491842, "grad_norm": 0.4160330010793307, "learning_rate": 4.5842947294739815e-05, "loss": 0.4725, "num_tokens": 664876332.0, "step": 2540 }, { "epoch": 1.1864801864801864, "grad_norm": 0.4360101739142026, "learning_rate": 4.582283539378012e-05, "loss": 0.4849, "num_tokens": 666187052.0, "step": 2545 }, { "epoch": 1.1888111888111887, "grad_norm": 0.4342183592761623, "learning_rate": 4.580267993616991e-05, "loss": 0.4825, "num_tokens": 667497772.0, "step": 2550 }, { "epoch": 1.191142191142191, "grad_norm": 0.39093089607584136, "learning_rate": 4.578248096982227e-05, "loss": 0.4577, "num_tokens": 668808492.0, "step": 2555 }, { "epoch": 1.1934731934731935, "grad_norm": 0.4265274502701168, "learning_rate": 4.576223854275378e-05, "loss": 0.4695, "num_tokens": 670119212.0, "step": 2560 }, { "epoch": 1.1958041958041958, "grad_norm": 0.5106783765028553, "learning_rate": 4.574195270308428e-05, "loss": 0.4596, "num_tokens": 671429932.0, "step": 2565 }, { "epoch": 1.1981351981351982, "grad_norm": 0.45465970639582615, "learning_rate": 4.572162349903685e-05, "loss": 0.4808, "num_tokens": 672740652.0, "step": 2570 }, { "epoch": 1.2004662004662006, "grad_norm": 0.4654417167187371, "learning_rate": 4.570125097893762e-05, "loss": 0.481, "num_tokens": 674051372.0, "step": 2575 }, { "epoch": 1.2027972027972027, "grad_norm": 0.41022644781405776, "learning_rate": 4.568083519121572e-05, "loss": 0.4741, "num_tokens": 675362092.0, "step": 2580 }, { "epoch": 1.205128205128205, "grad_norm": 0.43369798835654055, "learning_rate": 4.566037618440313e-05, "loss": 0.4842, "num_tokens": 676670859.0, "step": 2585 }, { "epoch": 1.2074592074592074, "grad_norm": 0.5470112563703007, "learning_rate": 4.563987400713454e-05, "loss": 0.4745, "num_tokens": 677981579.0, "step": 2590 }, { "epoch": 1.2097902097902098, "grad_norm": 0.4175250823314187, "learning_rate": 4.561932870814729e-05, "loss": 0.4714, "num_tokens": 679282079.0, "step": 2595 }, { "epoch": 1.2121212121212122, "grad_norm": 0.38110478817150606, "learning_rate": 4.5598740336281225e-05, "loss": 0.4675, "num_tokens": 680592799.0, "step": 2600 }, { "epoch": 1.2144522144522145, "grad_norm": 0.47620665151012714, "learning_rate": 4.557810894047859e-05, "loss": 0.4964, "num_tokens": 681887165.0, "step": 2605 }, { "epoch": 1.2167832167832167, "grad_norm": 0.4049180388899848, "learning_rate": 4.555743456978388e-05, "loss": 0.4744, "num_tokens": 683197885.0, "step": 2610 }, { "epoch": 1.219114219114219, "grad_norm": 0.4199713705201553, "learning_rate": 4.553671727334378e-05, "loss": 0.4786, "num_tokens": 684508605.0, "step": 2615 }, { "epoch": 1.2214452214452214, "grad_norm": 0.39770427056801977, "learning_rate": 4.5515957100407e-05, "loss": 0.4696, "num_tokens": 685819325.0, "step": 2620 }, { "epoch": 1.2237762237762237, "grad_norm": 0.4512802814267864, "learning_rate": 4.5495154100324166e-05, "loss": 0.4872, "num_tokens": 687130045.0, "step": 2625 }, { "epoch": 1.2261072261072261, "grad_norm": 0.42479431900627285, "learning_rate": 4.547430832254773e-05, "loss": 0.4706, "num_tokens": 688440765.0, "step": 2630 }, { "epoch": 1.2284382284382285, "grad_norm": 0.43515400124583126, "learning_rate": 4.545341981663182e-05, "loss": 0.4647, "num_tokens": 689751485.0, "step": 2635 }, { "epoch": 1.2307692307692308, "grad_norm": 0.392935620732648, "learning_rate": 4.543248863223215e-05, "loss": 0.4685, "num_tokens": 691062205.0, "step": 2640 }, { "epoch": 1.2331002331002332, "grad_norm": 0.46909934828902206, "learning_rate": 4.541151481910589e-05, "loss": 0.4717, "num_tokens": 692372925.0, "step": 2645 }, { "epoch": 1.2354312354312353, "grad_norm": 0.3944491634545739, "learning_rate": 4.5390498427111525e-05, "loss": 0.4873, "num_tokens": 693683645.0, "step": 2650 }, { "epoch": 1.2377622377622377, "grad_norm": 0.3542154968237942, "learning_rate": 4.536943950620877e-05, "loss": 0.4947, "num_tokens": 694994365.0, "step": 2655 }, { "epoch": 1.24009324009324, "grad_norm": 0.43496568575132033, "learning_rate": 4.5348338106458446e-05, "loss": 0.465, "num_tokens": 696305085.0, "step": 2660 }, { "epoch": 1.2424242424242424, "grad_norm": 0.38699131626194777, "learning_rate": 4.532719427802234e-05, "loss": 0.4752, "num_tokens": 697607524.0, "step": 2665 }, { "epoch": 1.2447552447552448, "grad_norm": 0.42606969233099046, "learning_rate": 4.5306008071163105e-05, "loss": 0.4965, "num_tokens": 698918244.0, "step": 2670 }, { "epoch": 1.2470862470862472, "grad_norm": 0.46040092500555474, "learning_rate": 4.528477953624416e-05, "loss": 0.4861, "num_tokens": 700228964.0, "step": 2675 }, { "epoch": 1.2494172494172493, "grad_norm": 0.40529545375633014, "learning_rate": 4.526350872372949e-05, "loss": 0.4576, "num_tokens": 701539684.0, "step": 2680 }, { "epoch": 1.2517482517482517, "grad_norm": 0.4181283911639419, "learning_rate": 4.524219568418364e-05, "loss": 0.4711, "num_tokens": 702850404.0, "step": 2685 }, { "epoch": 1.254079254079254, "grad_norm": 0.39553338610588684, "learning_rate": 4.522084046827148e-05, "loss": 0.476, "num_tokens": 704161124.0, "step": 2690 }, { "epoch": 1.2564102564102564, "grad_norm": 0.37450359234529307, "learning_rate": 4.51994431267582e-05, "loss": 0.4639, "num_tokens": 705462856.0, "step": 2695 }, { "epoch": 1.2587412587412588, "grad_norm": 0.4094194955768158, "learning_rate": 4.5178003710509087e-05, "loss": 0.4825, "num_tokens": 706773576.0, "step": 2700 }, { "epoch": 1.2610722610722611, "grad_norm": 0.4885076570757472, "learning_rate": 4.515652227048946e-05, "loss": 0.4737, "num_tokens": 708084296.0, "step": 2705 }, { "epoch": 1.2634032634032635, "grad_norm": 0.49289407887729214, "learning_rate": 4.513499885776453e-05, "loss": 0.4757, "num_tokens": 709395016.0, "step": 2710 }, { "epoch": 1.2657342657342658, "grad_norm": 0.3999863566296061, "learning_rate": 4.511343352349931e-05, "loss": 0.4839, "num_tokens": 710690433.0, "step": 2715 }, { "epoch": 1.2680652680652682, "grad_norm": 0.4305940675053385, "learning_rate": 4.5091826318958434e-05, "loss": 0.4744, "num_tokens": 712001153.0, "step": 2720 }, { "epoch": 1.2703962703962703, "grad_norm": 0.4514699940855802, "learning_rate": 4.50701772955061e-05, "loss": 0.4656, "num_tokens": 713311873.0, "step": 2725 }, { "epoch": 1.2727272727272727, "grad_norm": 0.3809637591571807, "learning_rate": 4.5048486504605874e-05, "loss": 0.4627, "num_tokens": 714622593.0, "step": 2730 }, { "epoch": 1.275058275058275, "grad_norm": 0.41819681647184476, "learning_rate": 4.502675399782066e-05, "loss": 0.4746, "num_tokens": 715933313.0, "step": 2735 }, { "epoch": 1.2773892773892774, "grad_norm": 0.4133255247787771, "learning_rate": 4.5004979826812505e-05, "loss": 0.4763, "num_tokens": 717244033.0, "step": 2740 }, { "epoch": 1.2797202797202798, "grad_norm": 0.4440690290427124, "learning_rate": 4.498316404334249e-05, "loss": 0.4857, "num_tokens": 718554753.0, "step": 2745 }, { "epoch": 1.282051282051282, "grad_norm": 0.41320454746022983, "learning_rate": 4.4961306699270634e-05, "loss": 0.4812, "num_tokens": 719865473.0, "step": 2750 }, { "epoch": 1.2843822843822843, "grad_norm": 0.37971285454946885, "learning_rate": 4.4939407846555734e-05, "loss": 0.4592, "num_tokens": 721176193.0, "step": 2755 }, { "epoch": 1.2867132867132867, "grad_norm": 0.38669049099636904, "learning_rate": 4.49174675372553e-05, "loss": 0.4808, "num_tokens": 722486913.0, "step": 2760 }, { "epoch": 1.289044289044289, "grad_norm": 0.38833499571078073, "learning_rate": 4.489548582352533e-05, "loss": 0.4648, "num_tokens": 723781879.0, "step": 2765 }, { "epoch": 1.2913752913752914, "grad_norm": 0.37993452738902034, "learning_rate": 4.487346275762031e-05, "loss": 0.468, "num_tokens": 725092599.0, "step": 2770 }, { "epoch": 1.2937062937062938, "grad_norm": 0.3673232976728133, "learning_rate": 4.4851398391892976e-05, "loss": 0.4648, "num_tokens": 726403319.0, "step": 2775 }, { "epoch": 1.2960372960372961, "grad_norm": 0.4116221051012755, "learning_rate": 4.482929277879428e-05, "loss": 0.4745, "num_tokens": 727707400.0, "step": 2780 }, { "epoch": 1.2983682983682985, "grad_norm": 0.4092445660610788, "learning_rate": 4.4807145970873206e-05, "loss": 0.4822, "num_tokens": 729018120.0, "step": 2785 }, { "epoch": 1.3006993006993006, "grad_norm": 0.4185448307140966, "learning_rate": 4.4784958020776665e-05, "loss": 0.4616, "num_tokens": 730328840.0, "step": 2790 }, { "epoch": 1.303030303030303, "grad_norm": 0.38479876763978926, "learning_rate": 4.476272898124938e-05, "loss": 0.4721, "num_tokens": 731639560.0, "step": 2795 }, { "epoch": 1.3053613053613053, "grad_norm": 0.46767133110126385, "learning_rate": 4.474045890513374e-05, "loss": 0.4752, "num_tokens": 732950280.0, "step": 2800 }, { "epoch": 1.3076923076923077, "grad_norm": 0.411649743246021, "learning_rate": 4.4718147845369696e-05, "loss": 0.4573, "num_tokens": 734261000.0, "step": 2805 }, { "epoch": 1.31002331002331, "grad_norm": 0.4316901135425129, "learning_rate": 4.469579585499463e-05, "loss": 0.4783, "num_tokens": 735564553.0, "step": 2810 }, { "epoch": 1.3123543123543124, "grad_norm": 0.40955334557870626, "learning_rate": 4.467340298714319e-05, "loss": 0.4883, "num_tokens": 736875273.0, "step": 2815 }, { "epoch": 1.3146853146853146, "grad_norm": 0.4080344761506793, "learning_rate": 4.4650969295047236e-05, "loss": 0.4832, "num_tokens": 738185993.0, "step": 2820 }, { "epoch": 1.317016317016317, "grad_norm": 0.4946836082141776, "learning_rate": 4.462849483203566e-05, "loss": 0.4761, "num_tokens": 739475770.0, "step": 2825 }, { "epoch": 1.3193473193473193, "grad_norm": 0.4584478564333903, "learning_rate": 4.460597965153426e-05, "loss": 0.4649, "num_tokens": 740786490.0, "step": 2830 }, { "epoch": 1.3216783216783217, "grad_norm": 0.41894692846730963, "learning_rate": 4.458342380706566e-05, "loss": 0.4809, "num_tokens": 742097210.0, "step": 2835 }, { "epoch": 1.324009324009324, "grad_norm": 0.4516243516797518, "learning_rate": 4.456082735224911e-05, "loss": 0.4703, "num_tokens": 743407930.0, "step": 2840 }, { "epoch": 1.3263403263403264, "grad_norm": 0.4254492278222537, "learning_rate": 4.4538190340800426e-05, "loss": 0.4793, "num_tokens": 744718650.0, "step": 2845 }, { "epoch": 1.3286713286713288, "grad_norm": 0.4137937243385849, "learning_rate": 4.451551282653182e-05, "loss": 0.48, "num_tokens": 746029370.0, "step": 2850 }, { "epoch": 1.3310023310023311, "grad_norm": 0.519752518142126, "learning_rate": 4.449279486335179e-05, "loss": 0.4736, "num_tokens": 747340090.0, "step": 2855 }, { "epoch": 1.3333333333333333, "grad_norm": 0.4353489605024039, "learning_rate": 4.4470036505265e-05, "loss": 0.4744, "num_tokens": 748650810.0, "step": 2860 }, { "epoch": 1.3356643356643356, "grad_norm": 0.4419316706152749, "learning_rate": 4.444723780637212e-05, "loss": 0.493, "num_tokens": 749939934.0, "step": 2865 }, { "epoch": 1.337995337995338, "grad_norm": 0.4901281137641553, "learning_rate": 4.442439882086973e-05, "loss": 0.4901, "num_tokens": 751250654.0, "step": 2870 }, { "epoch": 1.3403263403263403, "grad_norm": 0.4412679310378607, "learning_rate": 4.440151960305017e-05, "loss": 0.4725, "num_tokens": 752561374.0, "step": 2875 }, { "epoch": 1.3426573426573427, "grad_norm": 0.46469847523541835, "learning_rate": 4.437860020730144e-05, "loss": 0.4784, "num_tokens": 753872094.0, "step": 2880 }, { "epoch": 1.3449883449883449, "grad_norm": 0.4631294470988359, "learning_rate": 4.4355640688107024e-05, "loss": 0.4645, "num_tokens": 755182814.0, "step": 2885 }, { "epoch": 1.3473193473193472, "grad_norm": 0.4557090602532957, "learning_rate": 4.43326411000458e-05, "loss": 0.4713, "num_tokens": 756493534.0, "step": 2890 }, { "epoch": 1.3496503496503496, "grad_norm": 0.4386481997162464, "learning_rate": 4.4309601497791894e-05, "loss": 0.4733, "num_tokens": 757804254.0, "step": 2895 }, { "epoch": 1.351981351981352, "grad_norm": 0.4269351049606663, "learning_rate": 4.428652193611454e-05, "loss": 0.4692, "num_tokens": 759114974.0, "step": 2900 }, { "epoch": 1.3543123543123543, "grad_norm": 2.3425883033287866, "learning_rate": 4.4263402469878015e-05, "loss": 0.4567, "num_tokens": 760425694.0, "step": 2905 }, { "epoch": 1.3566433566433567, "grad_norm": 0.36072419381621385, "learning_rate": 4.424024315404137e-05, "loss": 0.4748, "num_tokens": 761736414.0, "step": 2910 }, { "epoch": 1.358974358974359, "grad_norm": 0.4249230241783985, "learning_rate": 4.421704404365847e-05, "loss": 0.4683, "num_tokens": 763047134.0, "step": 2915 }, { "epoch": 1.3613053613053614, "grad_norm": 0.4299202972649719, "learning_rate": 4.4193805193877714e-05, "loss": 0.4663, "num_tokens": 764357854.0, "step": 2920 }, { "epoch": 1.3636363636363638, "grad_norm": 0.3879041745135311, "learning_rate": 4.4170526659942015e-05, "loss": 0.4721, "num_tokens": 765660038.0, "step": 2925 }, { "epoch": 1.365967365967366, "grad_norm": 0.5484971150134953, "learning_rate": 4.414720849718859e-05, "loss": 0.4839, "num_tokens": 766970758.0, "step": 2930 }, { "epoch": 1.3682983682983683, "grad_norm": 0.40718901418820086, "learning_rate": 4.412385076104889e-05, "loss": 0.4667, "num_tokens": 768281478.0, "step": 2935 }, { "epoch": 1.3706293706293706, "grad_norm": 0.3876539740773812, "learning_rate": 4.410045350704841e-05, "loss": 0.4612, "num_tokens": 769592198.0, "step": 2940 }, { "epoch": 1.372960372960373, "grad_norm": 0.41309989970311883, "learning_rate": 4.4077016790806604e-05, "loss": 0.4705, "num_tokens": 770902918.0, "step": 2945 }, { "epoch": 1.3752913752913754, "grad_norm": 0.3744163790044172, "learning_rate": 4.405354066803673e-05, "loss": 0.4707, "num_tokens": 772213638.0, "step": 2950 }, { "epoch": 1.3776223776223775, "grad_norm": 0.480030813067916, "learning_rate": 4.403002519454573e-05, "loss": 0.489, "num_tokens": 773524358.0, "step": 2955 }, { "epoch": 1.3799533799533799, "grad_norm": 0.44969634064801967, "learning_rate": 4.400647042623407e-05, "loss": 0.4688, "num_tokens": 774835078.0, "step": 2960 }, { "epoch": 1.3822843822843822, "grad_norm": 0.4122214657313857, "learning_rate": 4.398287641909564e-05, "loss": 0.4521, "num_tokens": 776145798.0, "step": 2965 }, { "epoch": 1.3846153846153846, "grad_norm": 0.4180712738656679, "learning_rate": 4.395924322921762e-05, "loss": 0.471, "num_tokens": 777456518.0, "step": 2970 }, { "epoch": 1.386946386946387, "grad_norm": 0.49070370600152946, "learning_rate": 4.393557091278031e-05, "loss": 0.4844, "num_tokens": 778766755.0, "step": 2975 }, { "epoch": 1.3892773892773893, "grad_norm": 0.40125497971986246, "learning_rate": 4.391185952605703e-05, "loss": 0.4859, "num_tokens": 780077475.0, "step": 2980 }, { "epoch": 1.3916083916083917, "grad_norm": 0.4410310353900502, "learning_rate": 4.3888109125413984e-05, "loss": 0.4713, "num_tokens": 781383628.0, "step": 2985 }, { "epoch": 1.393939393939394, "grad_norm": 0.3981200505167726, "learning_rate": 4.3864319767310116e-05, "loss": 0.474, "num_tokens": 782694348.0, "step": 2990 }, { "epoch": 1.3962703962703964, "grad_norm": 0.4882530547316365, "learning_rate": 4.384049150829697e-05, "loss": 0.4907, "num_tokens": 784003029.0, "step": 2995 }, { "epoch": 1.3986013986013985, "grad_norm": 0.4625354000998703, "learning_rate": 4.381662440501857e-05, "loss": 0.4783, "num_tokens": 785313749.0, "step": 3000 }, { "epoch": 1.400932400932401, "grad_norm": 0.42320091867347914, "learning_rate": 4.379271851421129e-05, "loss": 0.4745, "num_tokens": 786608916.0, "step": 3005 }, { "epoch": 1.4032634032634033, "grad_norm": 0.34353265181156867, "learning_rate": 4.3768773892703696e-05, "loss": 0.4682, "num_tokens": 787919636.0, "step": 3010 }, { "epoch": 1.4055944055944056, "grad_norm": 0.41229838212196757, "learning_rate": 4.374479059741643e-05, "loss": 0.4903, "num_tokens": 789230356.0, "step": 3015 }, { "epoch": 1.407925407925408, "grad_norm": 0.3828081072364993, "learning_rate": 4.372076868536206e-05, "loss": 0.4685, "num_tokens": 790541076.0, "step": 3020 }, { "epoch": 1.4102564102564101, "grad_norm": 0.3782799956569614, "learning_rate": 4.369670821364497e-05, "loss": 0.4875, "num_tokens": 791851796.0, "step": 3025 }, { "epoch": 1.4125874125874125, "grad_norm": 0.38271818179702677, "learning_rate": 4.3672609239461185e-05, "loss": 0.472, "num_tokens": 793162516.0, "step": 3030 }, { "epoch": 1.4149184149184149, "grad_norm": 0.4432182142698113, "learning_rate": 4.364847182009827e-05, "loss": 0.4536, "num_tokens": 794473236.0, "step": 3035 }, { "epoch": 1.4172494172494172, "grad_norm": 0.4163541269383781, "learning_rate": 4.362429601293519e-05, "loss": 0.4674, "num_tokens": 795783956.0, "step": 3040 }, { "epoch": 1.4195804195804196, "grad_norm": 0.3963132427363548, "learning_rate": 4.360008187544213e-05, "loss": 0.4691, "num_tokens": 797094676.0, "step": 3045 }, { "epoch": 1.421911421911422, "grad_norm": 0.376804595278115, "learning_rate": 4.357582946518045e-05, "loss": 0.4638, "num_tokens": 798392983.0, "step": 3050 }, { "epoch": 1.4242424242424243, "grad_norm": 0.3809783523823445, "learning_rate": 4.355153883980243e-05, "loss": 0.4779, "num_tokens": 799693790.0, "step": 3055 }, { "epoch": 1.4265734265734267, "grad_norm": 0.3621884672482386, "learning_rate": 4.3527210057051246e-05, "loss": 0.4808, "num_tokens": 801004510.0, "step": 3060 }, { "epoch": 1.428904428904429, "grad_norm": 0.4816211455166946, "learning_rate": 4.3502843174760736e-05, "loss": 0.4627, "num_tokens": 802315230.0, "step": 3065 }, { "epoch": 1.4312354312354312, "grad_norm": 0.3686667678553204, "learning_rate": 4.3478438250855344e-05, "loss": 0.4781, "num_tokens": 803625950.0, "step": 3070 }, { "epoch": 1.4335664335664335, "grad_norm": 0.39139268593103854, "learning_rate": 4.345399534334993e-05, "loss": 0.4614, "num_tokens": 804936670.0, "step": 3075 }, { "epoch": 1.435897435897436, "grad_norm": 0.3682813822971589, "learning_rate": 4.3429514510349636e-05, "loss": 0.4698, "num_tokens": 806247390.0, "step": 3080 }, { "epoch": 1.4382284382284383, "grad_norm": 0.39577744941721116, "learning_rate": 4.340499581004979e-05, "loss": 0.4696, "num_tokens": 807558110.0, "step": 3085 }, { "epoch": 1.4405594405594406, "grad_norm": 0.39221995923827324, "learning_rate": 4.33804393007357e-05, "loss": 0.4575, "num_tokens": 808854280.0, "step": 3090 }, { "epoch": 1.4428904428904428, "grad_norm": 0.3469876104551282, "learning_rate": 4.335584504078258e-05, "loss": 0.4663, "num_tokens": 810160671.0, "step": 3095 }, { "epoch": 1.4452214452214451, "grad_norm": 0.38039731278638117, "learning_rate": 4.333121308865539e-05, "loss": 0.4656, "num_tokens": 811471391.0, "step": 3100 }, { "epoch": 1.4475524475524475, "grad_norm": 0.3871641009392114, "learning_rate": 4.330654350290866e-05, "loss": 0.4741, "num_tokens": 812782111.0, "step": 3105 }, { "epoch": 1.4498834498834499, "grad_norm": 0.3723812044477465, "learning_rate": 4.328183634218641e-05, "loss": 0.4616, "num_tokens": 814092831.0, "step": 3110 }, { "epoch": 1.4522144522144522, "grad_norm": 0.4159411274519776, "learning_rate": 4.325709166522196e-05, "loss": 0.4705, "num_tokens": 815403551.0, "step": 3115 }, { "epoch": 1.4545454545454546, "grad_norm": 0.3786619070365938, "learning_rate": 4.3232309530837826e-05, "loss": 0.4702, "num_tokens": 816714271.0, "step": 3120 }, { "epoch": 1.456876456876457, "grad_norm": 0.3629693685192925, "learning_rate": 4.320748999794558e-05, "loss": 0.4623, "num_tokens": 818024991.0, "step": 3125 }, { "epoch": 1.4592074592074593, "grad_norm": 0.3807642152047702, "learning_rate": 4.3182633125545664e-05, "loss": 0.4826, "num_tokens": 819335711.0, "step": 3130 }, { "epoch": 1.4615384615384617, "grad_norm": 0.36345106221694695, "learning_rate": 4.3157738972727316e-05, "loss": 0.4749, "num_tokens": 820646431.0, "step": 3135 }, { "epoch": 1.4638694638694638, "grad_norm": 0.3982169584509918, "learning_rate": 4.3132807598668366e-05, "loss": 0.4592, "num_tokens": 821957151.0, "step": 3140 }, { "epoch": 1.4662004662004662, "grad_norm": 0.3578005945222367, "learning_rate": 4.310783906263515e-05, "loss": 0.472, "num_tokens": 823267871.0, "step": 3145 }, { "epoch": 1.4685314685314685, "grad_norm": 0.44584910654927556, "learning_rate": 4.3082833423982346e-05, "loss": 0.4682, "num_tokens": 824556476.0, "step": 3150 }, { "epoch": 1.470862470862471, "grad_norm": 0.439322244082669, "learning_rate": 4.3057790742152785e-05, "loss": 0.4572, "num_tokens": 825867196.0, "step": 3155 }, { "epoch": 1.4731934731934733, "grad_norm": 0.3890285611037925, "learning_rate": 4.3032711076677436e-05, "loss": 0.4684, "num_tokens": 827177916.0, "step": 3160 }, { "epoch": 1.4755244755244754, "grad_norm": 0.4262221846633365, "learning_rate": 4.3007594487175114e-05, "loss": 0.4748, "num_tokens": 828488636.0, "step": 3165 }, { "epoch": 1.4778554778554778, "grad_norm": 0.3770399690427187, "learning_rate": 4.298244103335244e-05, "loss": 0.4597, "num_tokens": 829799356.0, "step": 3170 }, { "epoch": 1.4801864801864801, "grad_norm": 0.3524048177654436, "learning_rate": 4.2957250775003664e-05, "loss": 0.4814, "num_tokens": 831095535.0, "step": 3175 }, { "epoch": 1.4825174825174825, "grad_norm": 0.4087741944757923, "learning_rate": 4.293202377201053e-05, "loss": 0.4892, "num_tokens": 832394592.0, "step": 3180 }, { "epoch": 1.4848484848484849, "grad_norm": 0.3875644282355541, "learning_rate": 4.290676008434214e-05, "loss": 0.4817, "num_tokens": 833705312.0, "step": 3185 }, { "epoch": 1.4871794871794872, "grad_norm": 0.41275819372925243, "learning_rate": 4.2881459772054764e-05, "loss": 0.4705, "num_tokens": 835016032.0, "step": 3190 }, { "epoch": 1.4895104895104896, "grad_norm": 0.3565273655614255, "learning_rate": 4.2856122895291767e-05, "loss": 0.4539, "num_tokens": 836326752.0, "step": 3195 }, { "epoch": 1.491841491841492, "grad_norm": 0.4013967833750652, "learning_rate": 4.2830749514283444e-05, "loss": 0.471, "num_tokens": 837637472.0, "step": 3200 }, { "epoch": 1.494172494172494, "grad_norm": 0.44988006189556307, "learning_rate": 4.280533968934683e-05, "loss": 0.4737, "num_tokens": 838948192.0, "step": 3205 }, { "epoch": 1.4965034965034965, "grad_norm": 0.4128029879161445, "learning_rate": 4.277989348088564e-05, "loss": 0.4618, "num_tokens": 840258912.0, "step": 3210 }, { "epoch": 1.4988344988344988, "grad_norm": 0.3860230229659164, "learning_rate": 4.275441094939002e-05, "loss": 0.4772, "num_tokens": 841569632.0, "step": 3215 }, { "epoch": 1.5011655011655012, "grad_norm": 0.38330430061235765, "learning_rate": 4.2728892155436524e-05, "loss": 0.4655, "num_tokens": 842875786.0, "step": 3220 }, { "epoch": 1.5034965034965035, "grad_norm": 0.3583158818403601, "learning_rate": 4.270333715968787e-05, "loss": 0.4637, "num_tokens": 844186506.0, "step": 3225 }, { "epoch": 1.5058275058275057, "grad_norm": 0.4292683787844713, "learning_rate": 4.267774602289285e-05, "loss": 0.4513, "num_tokens": 845490179.0, "step": 3230 }, { "epoch": 1.508158508158508, "grad_norm": 0.4561169755057234, "learning_rate": 4.265211880588617e-05, "loss": 0.4575, "num_tokens": 846800899.0, "step": 3235 }, { "epoch": 1.5104895104895104, "grad_norm": 0.38223276760013997, "learning_rate": 4.2626455569588274e-05, "loss": 0.4591, "num_tokens": 848111619.0, "step": 3240 }, { "epoch": 1.5128205128205128, "grad_norm": 0.3989737822244612, "learning_rate": 4.260075637500528e-05, "loss": 0.4791, "num_tokens": 849422339.0, "step": 3245 }, { "epoch": 1.5151515151515151, "grad_norm": 0.39448654246499554, "learning_rate": 4.257502128322875e-05, "loss": 0.4697, "num_tokens": 850733059.0, "step": 3250 }, { "epoch": 1.5174825174825175, "grad_norm": 0.38135346811109977, "learning_rate": 4.25492503554356e-05, "loss": 0.4858, "num_tokens": 852033301.0, "step": 3255 }, { "epoch": 1.5198135198135199, "grad_norm": 0.4063492887591379, "learning_rate": 4.252344365288791e-05, "loss": 0.4558, "num_tokens": 853344021.0, "step": 3260 }, { "epoch": 1.5221445221445222, "grad_norm": 0.41694831501332164, "learning_rate": 4.2497601236932836e-05, "loss": 0.4695, "num_tokens": 854654741.0, "step": 3265 }, { "epoch": 1.5244755244755246, "grad_norm": 0.36492369915112965, "learning_rate": 4.2471723169002404e-05, "loss": 0.4656, "num_tokens": 855965461.0, "step": 3270 }, { "epoch": 1.526806526806527, "grad_norm": 0.4768438850647179, "learning_rate": 4.244580951061341e-05, "loss": 0.4628, "num_tokens": 857276181.0, "step": 3275 }, { "epoch": 1.529137529137529, "grad_norm": 0.364684932951632, "learning_rate": 4.2419860323367236e-05, "loss": 0.4789, "num_tokens": 858586901.0, "step": 3280 }, { "epoch": 1.5314685314685315, "grad_norm": 0.3723413634564425, "learning_rate": 4.239387566894973e-05, "loss": 0.4852, "num_tokens": 859897621.0, "step": 3285 }, { "epoch": 1.5337995337995338, "grad_norm": 0.4225249222913032, "learning_rate": 4.2367855609131074e-05, "loss": 0.479, "num_tokens": 861194968.0, "step": 3290 }, { "epoch": 1.5361305361305362, "grad_norm": 0.37715348099690094, "learning_rate": 4.234180020576556e-05, "loss": 0.4849, "num_tokens": 862505688.0, "step": 3295 }, { "epoch": 1.5384615384615383, "grad_norm": 0.38467493999417174, "learning_rate": 4.231570952079157e-05, "loss": 0.4664, "num_tokens": 863814518.0, "step": 3300 }, { "epoch": 1.5407925407925407, "grad_norm": 0.37660805921200846, "learning_rate": 4.22895836162313e-05, "loss": 0.4735, "num_tokens": 865125238.0, "step": 3305 }, { "epoch": 1.543123543123543, "grad_norm": 0.3576120232900732, "learning_rate": 4.226342255419069e-05, "loss": 0.4836, "num_tokens": 866435958.0, "step": 3310 }, { "epoch": 1.5454545454545454, "grad_norm": 0.35132235584289356, "learning_rate": 4.2237226396859256e-05, "loss": 0.4482, "num_tokens": 867746678.0, "step": 3315 }, { "epoch": 1.5477855477855478, "grad_norm": 0.44192479046805605, "learning_rate": 4.2210995206509945e-05, "loss": 0.4741, "num_tokens": 869057398.0, "step": 3320 }, { "epoch": 1.5501165501165501, "grad_norm": 0.46240430648199005, "learning_rate": 4.218472904549897e-05, "loss": 0.4685, "num_tokens": 870368118.0, "step": 3325 }, { "epoch": 1.5524475524475525, "grad_norm": 0.3991386786525242, "learning_rate": 4.215842797626569e-05, "loss": 0.4821, "num_tokens": 871678838.0, "step": 3330 }, { "epoch": 1.5547785547785549, "grad_norm": 0.4310594533956871, "learning_rate": 4.2132092061332444e-05, "loss": 0.4716, "num_tokens": 872989558.0, "step": 3335 }, { "epoch": 1.5571095571095572, "grad_norm": 0.4719206923402879, "learning_rate": 4.21057213633044e-05, "loss": 0.4636, "num_tokens": 874287357.0, "step": 3340 }, { "epoch": 1.5594405594405596, "grad_norm": 0.4174800382083453, "learning_rate": 4.207931594486941e-05, "loss": 0.4702, "num_tokens": 875598077.0, "step": 3345 }, { "epoch": 1.5617715617715617, "grad_norm": 0.4353294397787359, "learning_rate": 4.205287586879788e-05, "loss": 0.4731, "num_tokens": 876908797.0, "step": 3350 }, { "epoch": 1.564102564102564, "grad_norm": 0.4245486575857189, "learning_rate": 4.202640119794258e-05, "loss": 0.4897, "num_tokens": 878219517.0, "step": 3355 }, { "epoch": 1.5664335664335665, "grad_norm": 0.36829836620338285, "learning_rate": 4.1999891995238525e-05, "loss": 0.4713, "num_tokens": 879530237.0, "step": 3360 }, { "epoch": 1.5687645687645686, "grad_norm": 0.4572528057427085, "learning_rate": 4.1973348323702834e-05, "loss": 0.4839, "num_tokens": 880840957.0, "step": 3365 }, { "epoch": 1.571095571095571, "grad_norm": 0.4298611015712589, "learning_rate": 4.1946770246434554e-05, "loss": 0.4712, "num_tokens": 882151677.0, "step": 3370 }, { "epoch": 1.5734265734265733, "grad_norm": 0.4045036094575523, "learning_rate": 4.19201578266145e-05, "loss": 0.4762, "num_tokens": 883462397.0, "step": 3375 }, { "epoch": 1.5757575757575757, "grad_norm": 0.3508435597625087, "learning_rate": 4.1893511127505155e-05, "loss": 0.4771, "num_tokens": 884773117.0, "step": 3380 }, { "epoch": 1.578088578088578, "grad_norm": 0.34886439925605456, "learning_rate": 4.186683021245048e-05, "loss": 0.4667, "num_tokens": 886083837.0, "step": 3385 }, { "epoch": 1.5804195804195804, "grad_norm": 0.3763065049584587, "learning_rate": 4.1840115144875784e-05, "loss": 0.4802, "num_tokens": 887394557.0, "step": 3390 }, { "epoch": 1.5827505827505828, "grad_norm": 0.49328331391815927, "learning_rate": 4.1813365988287536e-05, "loss": 0.4842, "num_tokens": 888691868.0, "step": 3395 }, { "epoch": 1.5850815850815851, "grad_norm": 0.3986181417508742, "learning_rate": 4.178658280627326e-05, "loss": 0.484, "num_tokens": 890002588.0, "step": 3400 }, { "epoch": 1.5874125874125875, "grad_norm": 0.49931340147631254, "learning_rate": 4.175976566250136e-05, "loss": 0.484, "num_tokens": 891313308.0, "step": 3405 }, { "epoch": 1.5897435897435899, "grad_norm": 0.45433476497736364, "learning_rate": 4.173291462072098e-05, "loss": 0.4618, "num_tokens": 892617089.0, "step": 3410 }, { "epoch": 1.5920745920745922, "grad_norm": 0.384524852468567, "learning_rate": 4.170602974476184e-05, "loss": 0.468, "num_tokens": 893927809.0, "step": 3415 }, { "epoch": 1.5944055944055944, "grad_norm": 0.3807586219393433, "learning_rate": 4.167911109853407e-05, "loss": 0.4771, "num_tokens": 895238529.0, "step": 3420 }, { "epoch": 1.5967365967365967, "grad_norm": 0.4447754359804572, "learning_rate": 4.1652158746028116e-05, "loss": 0.4716, "num_tokens": 896541317.0, "step": 3425 }, { "epoch": 1.599067599067599, "grad_norm": 0.42809843952530335, "learning_rate": 4.162517275131454e-05, "loss": 0.4604, "num_tokens": 897852037.0, "step": 3430 }, { "epoch": 1.6013986013986012, "grad_norm": 0.38368322059871096, "learning_rate": 4.159815317854384e-05, "loss": 0.4722, "num_tokens": 899157306.0, "step": 3435 }, { "epoch": 1.6037296037296036, "grad_norm": 0.41349052270597325, "learning_rate": 4.157110009194639e-05, "loss": 0.4854, "num_tokens": 900455090.0, "step": 3440 }, { "epoch": 1.606060606060606, "grad_norm": 0.37303520873037516, "learning_rate": 4.15440135558322e-05, "loss": 0.4504, "num_tokens": 901765810.0, "step": 3445 }, { "epoch": 1.6083916083916083, "grad_norm": 0.35322625283306586, "learning_rate": 4.151689363459078e-05, "loss": 0.4829, "num_tokens": 903076530.0, "step": 3450 }, { "epoch": 1.6107226107226107, "grad_norm": 0.41613261144008123, "learning_rate": 4.1489740392691054e-05, "loss": 0.4642, "num_tokens": 904387250.0, "step": 3455 }, { "epoch": 1.613053613053613, "grad_norm": 0.36788455921816615, "learning_rate": 4.1462553894681115e-05, "loss": 0.4504, "num_tokens": 905697970.0, "step": 3460 }, { "epoch": 1.6153846153846154, "grad_norm": 0.3865061275286875, "learning_rate": 4.1435334205188106e-05, "loss": 0.4742, "num_tokens": 907008690.0, "step": 3465 }, { "epoch": 1.6177156177156178, "grad_norm": 0.4445705789673663, "learning_rate": 4.1408081388918114e-05, "loss": 0.4611, "num_tokens": 908319410.0, "step": 3470 }, { "epoch": 1.6200466200466201, "grad_norm": 0.3887208640873298, "learning_rate": 4.138079551065593e-05, "loss": 0.4561, "num_tokens": 909630130.0, "step": 3475 }, { "epoch": 1.6223776223776225, "grad_norm": 0.377258046210296, "learning_rate": 4.135347663526496e-05, "loss": 0.4745, "num_tokens": 910940850.0, "step": 3480 }, { "epoch": 1.6247086247086249, "grad_norm": 0.3678540515945634, "learning_rate": 4.132612482768704e-05, "loss": 0.4724, "num_tokens": 912251570.0, "step": 3485 }, { "epoch": 1.627039627039627, "grad_norm": 0.36120123975381446, "learning_rate": 4.129874015294234e-05, "loss": 0.4844, "num_tokens": 913562290.0, "step": 3490 }, { "epoch": 1.6293706293706294, "grad_norm": 0.409642194208649, "learning_rate": 4.127132267612907e-05, "loss": 0.4665, "num_tokens": 914873010.0, "step": 3495 }, { "epoch": 1.6317016317016317, "grad_norm": 0.34029196923903116, "learning_rate": 4.1243872462423485e-05, "loss": 0.4753, "num_tokens": 916183730.0, "step": 3500 }, { "epoch": 1.6340326340326339, "grad_norm": 0.407667262964334, "learning_rate": 4.121638957707965e-05, "loss": 0.4627, "num_tokens": 917494450.0, "step": 3505 }, { "epoch": 1.6363636363636362, "grad_norm": 0.35645199665718186, "learning_rate": 4.118887408542927e-05, "loss": 0.4705, "num_tokens": 918791806.0, "step": 3510 }, { "epoch": 1.6386946386946386, "grad_norm": 0.37596379435936367, "learning_rate": 4.11613260528816e-05, "loss": 0.4591, "num_tokens": 920102526.0, "step": 3515 }, { "epoch": 1.641025641025641, "grad_norm": 0.4320558743649901, "learning_rate": 4.1133745544923236e-05, "loss": 0.456, "num_tokens": 921413246.0, "step": 3520 }, { "epoch": 1.6433566433566433, "grad_norm": 0.4341288373141138, "learning_rate": 4.1106132627117956e-05, "loss": 0.4748, "num_tokens": 922715595.0, "step": 3525 }, { "epoch": 1.6456876456876457, "grad_norm": 0.41676180005636126, "learning_rate": 4.107848736510659e-05, "loss": 0.4575, "num_tokens": 924026315.0, "step": 3530 }, { "epoch": 1.648018648018648, "grad_norm": 0.40802471968176357, "learning_rate": 4.105080982460687e-05, "loss": 0.4628, "num_tokens": 925337035.0, "step": 3535 }, { "epoch": 1.6503496503496504, "grad_norm": 0.4507954091649136, "learning_rate": 4.102310007141324e-05, "loss": 0.4837, "num_tokens": 926631017.0, "step": 3540 }, { "epoch": 1.6526806526806528, "grad_norm": 0.4050589740560033, "learning_rate": 4.0995358171396747e-05, "loss": 0.4736, "num_tokens": 927941737.0, "step": 3545 }, { "epoch": 1.6550116550116551, "grad_norm": 0.42380553595370807, "learning_rate": 4.0967584190504825e-05, "loss": 0.4734, "num_tokens": 929252457.0, "step": 3550 }, { "epoch": 1.6573426573426573, "grad_norm": 0.3958904686940088, "learning_rate": 4.0939778194761196e-05, "loss": 0.488, "num_tokens": 930563177.0, "step": 3555 }, { "epoch": 1.6596736596736597, "grad_norm": 0.43394413515654184, "learning_rate": 4.091194025026567e-05, "loss": 0.4692, "num_tokens": 931860496.0, "step": 3560 }, { "epoch": 1.662004662004662, "grad_norm": 0.4589951627434292, "learning_rate": 4.0884070423194007e-05, "loss": 0.4805, "num_tokens": 933171216.0, "step": 3565 }, { "epoch": 1.6643356643356644, "grad_norm": 0.4150754775824352, "learning_rate": 4.085616877979776e-05, "loss": 0.4628, "num_tokens": 934465598.0, "step": 3570 }, { "epoch": 1.6666666666666665, "grad_norm": 0.40687005175901875, "learning_rate": 4.0828235386404124e-05, "loss": 0.4564, "num_tokens": 935774296.0, "step": 3575 }, { "epoch": 1.6689976689976689, "grad_norm": 0.3856327268150972, "learning_rate": 4.0800270309415756e-05, "loss": 0.4635, "num_tokens": 937085016.0, "step": 3580 }, { "epoch": 1.6713286713286712, "grad_norm": 0.43509202476957415, "learning_rate": 4.077227361531063e-05, "loss": 0.4708, "num_tokens": 938395736.0, "step": 3585 }, { "epoch": 1.6736596736596736, "grad_norm": 0.37887891871157525, "learning_rate": 4.07442453706419e-05, "loss": 0.4775, "num_tokens": 939706456.0, "step": 3590 }, { "epoch": 1.675990675990676, "grad_norm": 0.35092056020696705, "learning_rate": 4.07161856420377e-05, "loss": 0.4642, "num_tokens": 941017176.0, "step": 3595 }, { "epoch": 1.6783216783216783, "grad_norm": 0.3618238243762927, "learning_rate": 4.068809449620101e-05, "loss": 0.4728, "num_tokens": 942319215.0, "step": 3600 }, { "epoch": 1.6806526806526807, "grad_norm": 0.3850438304208137, "learning_rate": 4.065997199990951e-05, "loss": 0.4823, "num_tokens": 943629935.0, "step": 3605 }, { "epoch": 1.682983682983683, "grad_norm": 0.3981622077125061, "learning_rate": 4.063181822001538e-05, "loss": 0.4605, "num_tokens": 944940655.0, "step": 3610 }, { "epoch": 1.6853146853146854, "grad_norm": 0.4200393473429356, "learning_rate": 4.060363322344518e-05, "loss": 0.4816, "num_tokens": 946251375.0, "step": 3615 }, { "epoch": 1.6876456876456878, "grad_norm": 0.389339620745983, "learning_rate": 4.05754170771997e-05, "loss": 0.4714, "num_tokens": 947562095.0, "step": 3620 }, { "epoch": 1.68997668997669, "grad_norm": 0.4117742280678568, "learning_rate": 4.054716984835372e-05, "loss": 0.4695, "num_tokens": 948872815.0, "step": 3625 }, { "epoch": 1.6923076923076923, "grad_norm": 0.4245786229670795, "learning_rate": 4.051889160405598e-05, "loss": 0.4574, "num_tokens": 950183535.0, "step": 3630 }, { "epoch": 1.6946386946386947, "grad_norm": 0.4777093865988957, "learning_rate": 4.0490582411528896e-05, "loss": 0.4572, "num_tokens": 951494255.0, "step": 3635 }, { "epoch": 1.696969696969697, "grad_norm": 0.3924099164372027, "learning_rate": 4.0462242338068476e-05, "loss": 0.4651, "num_tokens": 952804975.0, "step": 3640 }, { "epoch": 1.6993006993006992, "grad_norm": 0.38987930954143907, "learning_rate": 4.0433871451044136e-05, "loss": 0.4873, "num_tokens": 954115695.0, "step": 3645 }, { "epoch": 1.7016317016317015, "grad_norm": 0.41461621362174245, "learning_rate": 4.040546981789854e-05, "loss": 0.4748, "num_tokens": 955426415.0, "step": 3650 }, { "epoch": 1.7039627039627039, "grad_norm": 0.3425572462390038, "learning_rate": 4.0377037506147436e-05, "loss": 0.4858, "num_tokens": 956737135.0, "step": 3655 }, { "epoch": 1.7062937062937062, "grad_norm": 0.35362235462359226, "learning_rate": 4.0348574583379506e-05, "loss": 0.4515, "num_tokens": 958047855.0, "step": 3660 }, { "epoch": 1.7086247086247086, "grad_norm": 0.4269439890212748, "learning_rate": 4.032008111725619e-05, "loss": 0.478, "num_tokens": 959358575.0, "step": 3665 }, { "epoch": 1.710955710955711, "grad_norm": 0.3975233814611868, "learning_rate": 4.029155717551156e-05, "loss": 0.4682, "num_tokens": 960647821.0, "step": 3670 }, { "epoch": 1.7132867132867133, "grad_norm": 0.42538111394289835, "learning_rate": 4.026300282595211e-05, "loss": 0.4821, "num_tokens": 961958541.0, "step": 3675 }, { "epoch": 1.7156177156177157, "grad_norm": 0.44368078414892886, "learning_rate": 4.023441813645662e-05, "loss": 0.4629, "num_tokens": 963269261.0, "step": 3680 }, { "epoch": 1.717948717948718, "grad_norm": 0.4096743909582884, "learning_rate": 4.0205803174975996e-05, "loss": 0.4678, "num_tokens": 964579981.0, "step": 3685 }, { "epoch": 1.7202797202797204, "grad_norm": 0.36149619103114544, "learning_rate": 4.0177158009533136e-05, "loss": 0.4661, "num_tokens": 965885806.0, "step": 3690 }, { "epoch": 1.7226107226107226, "grad_norm": 0.3883498094552971, "learning_rate": 4.014848270822268e-05, "loss": 0.4679, "num_tokens": 967191092.0, "step": 3695 }, { "epoch": 1.724941724941725, "grad_norm": 0.3874435629285386, "learning_rate": 4.011977733921096e-05, "loss": 0.4613, "num_tokens": 968501812.0, "step": 3700 }, { "epoch": 1.7272727272727273, "grad_norm": 0.40622823181165385, "learning_rate": 4.009104197073575e-05, "loss": 0.4813, "num_tokens": 969796954.0, "step": 3705 }, { "epoch": 1.7296037296037297, "grad_norm": 0.3576403196641578, "learning_rate": 4.0062276671106154e-05, "loss": 0.456, "num_tokens": 971107674.0, "step": 3710 }, { "epoch": 1.7319347319347318, "grad_norm": 0.42428688878098647, "learning_rate": 4.0033481508702425e-05, "loss": 0.4771, "num_tokens": 972418394.0, "step": 3715 }, { "epoch": 1.7342657342657342, "grad_norm": 0.398776005644926, "learning_rate": 4.00046565519758e-05, "loss": 0.4826, "num_tokens": 973729114.0, "step": 3720 }, { "epoch": 1.7365967365967365, "grad_norm": 0.3803669387697352, "learning_rate": 3.997580186944835e-05, "loss": 0.4817, "num_tokens": 975039834.0, "step": 3725 }, { "epoch": 1.7389277389277389, "grad_norm": 0.3668247169705026, "learning_rate": 3.994691752971282e-05, "loss": 0.4671, "num_tokens": 976350554.0, "step": 3730 }, { "epoch": 1.7412587412587412, "grad_norm": 0.3618934392527535, "learning_rate": 3.991800360143241e-05, "loss": 0.475, "num_tokens": 977661274.0, "step": 3735 }, { "epoch": 1.7435897435897436, "grad_norm": 0.3616956268703821, "learning_rate": 3.988906015334073e-05, "loss": 0.4595, "num_tokens": 978971994.0, "step": 3740 }, { "epoch": 1.745920745920746, "grad_norm": 0.3930597565056608, "learning_rate": 3.986008725424148e-05, "loss": 0.465, "num_tokens": 980282714.0, "step": 3745 }, { "epoch": 1.7482517482517483, "grad_norm": 0.39536789600867117, "learning_rate": 3.983108497300844e-05, "loss": 0.4701, "num_tokens": 981585502.0, "step": 3750 }, { "epoch": 1.7505827505827507, "grad_norm": 0.4099848875497283, "learning_rate": 3.9802053378585205e-05, "loss": 0.4751, "num_tokens": 982896222.0, "step": 3755 }, { "epoch": 1.752913752913753, "grad_norm": 0.3722702285436598, "learning_rate": 3.977299253998504e-05, "loss": 0.4738, "num_tokens": 984206942.0, "step": 3760 }, { "epoch": 1.7552447552447552, "grad_norm": 0.3970652311409931, "learning_rate": 3.974390252629078e-05, "loss": 0.4671, "num_tokens": 985517662.0, "step": 3765 }, { "epoch": 1.7575757575757576, "grad_norm": 0.38418584202250494, "learning_rate": 3.971478340665455e-05, "loss": 0.478, "num_tokens": 986828382.0, "step": 3770 }, { "epoch": 1.75990675990676, "grad_norm": 0.4897203405840543, "learning_rate": 3.968563525029771e-05, "loss": 0.4758, "num_tokens": 988139102.0, "step": 3775 }, { "epoch": 1.762237762237762, "grad_norm": 0.4708593559155846, "learning_rate": 3.965645812651063e-05, "loss": 0.4872, "num_tokens": 989444809.0, "step": 3780 }, { "epoch": 1.7645687645687644, "grad_norm": 0.4215417171128016, "learning_rate": 3.9627252104652535e-05, "loss": 0.4591, "num_tokens": 990753790.0, "step": 3785 }, { "epoch": 1.7668997668997668, "grad_norm": 0.40405280271269434, "learning_rate": 3.959801725415136e-05, "loss": 0.4648, "num_tokens": 992054396.0, "step": 3790 }, { "epoch": 1.7692307692307692, "grad_norm": 0.41393808447009156, "learning_rate": 3.9568753644503566e-05, "loss": 0.4587, "num_tokens": 993352731.0, "step": 3795 }, { "epoch": 1.7715617715617715, "grad_norm": 0.37732142695063003, "learning_rate": 3.9539461345273956e-05, "loss": 0.4737, "num_tokens": 994656385.0, "step": 3800 }, { "epoch": 1.7738927738927739, "grad_norm": 0.528280250373434, "learning_rate": 3.951014042609559e-05, "loss": 0.4702, "num_tokens": 995967105.0, "step": 3805 }, { "epoch": 1.7762237762237763, "grad_norm": 0.4213342625415021, "learning_rate": 3.9480790956669486e-05, "loss": 0.4791, "num_tokens": 997267208.0, "step": 3810 }, { "epoch": 1.7785547785547786, "grad_norm": 0.3698430883873798, "learning_rate": 3.9451413006764604e-05, "loss": 0.4653, "num_tokens": 998577928.0, "step": 3815 }, { "epoch": 1.780885780885781, "grad_norm": 0.414231394973144, "learning_rate": 3.942200664621756e-05, "loss": 0.4687, "num_tokens": 999888162.0, "step": 3820 }, { "epoch": 1.7832167832167833, "grad_norm": 0.4350040599172444, "learning_rate": 3.939257194493253e-05, "loss": 0.4513, "num_tokens": 1001186063.0, "step": 3825 }, { "epoch": 1.7855477855477857, "grad_norm": 0.4152803596705445, "learning_rate": 3.936310897288104e-05, "loss": 0.4562, "num_tokens": 1002496783.0, "step": 3830 }, { "epoch": 1.7878787878787878, "grad_norm": 0.41881419362638367, "learning_rate": 3.933361780010185e-05, "loss": 0.4646, "num_tokens": 1003807503.0, "step": 3835 }, { "epoch": 1.7902097902097902, "grad_norm": 0.38522642875173096, "learning_rate": 3.930409849670073e-05, "loss": 0.4596, "num_tokens": 1005118223.0, "step": 3840 }, { "epoch": 1.7925407925407926, "grad_norm": 0.34873890955647613, "learning_rate": 3.927455113285035e-05, "loss": 0.4559, "num_tokens": 1006428943.0, "step": 3845 }, { "epoch": 1.7948717948717947, "grad_norm": 0.39305628827987876, "learning_rate": 3.924497577879005e-05, "loss": 0.4647, "num_tokens": 1007739663.0, "step": 3850 }, { "epoch": 1.797202797202797, "grad_norm": 0.3854549232040684, "learning_rate": 3.9215372504825735e-05, "loss": 0.4737, "num_tokens": 1009050383.0, "step": 3855 }, { "epoch": 1.7995337995337994, "grad_norm": 0.40079016381501426, "learning_rate": 3.9185741381329664e-05, "loss": 0.4792, "num_tokens": 1010361103.0, "step": 3860 }, { "epoch": 1.8018648018648018, "grad_norm": 0.40266606077066175, "learning_rate": 3.915608247874032e-05, "loss": 0.487, "num_tokens": 1011671823.0, "step": 3865 }, { "epoch": 1.8041958041958042, "grad_norm": 0.3510993917665707, "learning_rate": 3.912639586756221e-05, "loss": 0.4514, "num_tokens": 1012982543.0, "step": 3870 }, { "epoch": 1.8065268065268065, "grad_norm": 0.3907134061685135, "learning_rate": 3.9096681618365686e-05, "loss": 0.447, "num_tokens": 1014293263.0, "step": 3875 }, { "epoch": 1.808857808857809, "grad_norm": 0.3708262682688497, "learning_rate": 3.9066939801786836e-05, "loss": 0.4765, "num_tokens": 1015592603.0, "step": 3880 }, { "epoch": 1.8111888111888113, "grad_norm": 0.44553625905509964, "learning_rate": 3.903717048852728e-05, "loss": 0.4709, "num_tokens": 1016903323.0, "step": 3885 }, { "epoch": 1.8135198135198136, "grad_norm": 0.5102907463594831, "learning_rate": 3.900737374935396e-05, "loss": 0.477, "num_tokens": 1018214043.0, "step": 3890 }, { "epoch": 1.815850815850816, "grad_norm": 0.4103843094724411, "learning_rate": 3.897754965509908e-05, "loss": 0.4557, "num_tokens": 1019524763.0, "step": 3895 }, { "epoch": 1.8181818181818183, "grad_norm": 0.3973998481652665, "learning_rate": 3.8947698276659806e-05, "loss": 0.4606, "num_tokens": 1020835483.0, "step": 3900 }, { "epoch": 1.8205128205128205, "grad_norm": 0.4071222548172448, "learning_rate": 3.8917819684998215e-05, "loss": 0.4734, "num_tokens": 1022146203.0, "step": 3905 }, { "epoch": 1.8228438228438228, "grad_norm": 0.38953013965021704, "learning_rate": 3.888791395114103e-05, "loss": 0.4481, "num_tokens": 1023456923.0, "step": 3910 }, { "epoch": 1.8251748251748252, "grad_norm": 0.34776150147238477, "learning_rate": 3.885798114617954e-05, "loss": 0.4653, "num_tokens": 1024767643.0, "step": 3915 }, { "epoch": 1.8275058275058274, "grad_norm": 0.43412541332007054, "learning_rate": 3.8828021341269363e-05, "loss": 0.4696, "num_tokens": 1026078363.0, "step": 3920 }, { "epoch": 1.8298368298368297, "grad_norm": 0.39408936405816397, "learning_rate": 3.879803460763029e-05, "loss": 0.471, "num_tokens": 1027389083.0, "step": 3925 }, { "epoch": 1.832167832167832, "grad_norm": 0.36162777389200856, "learning_rate": 3.876802101654614e-05, "loss": 0.4669, "num_tokens": 1028699803.0, "step": 3930 }, { "epoch": 1.8344988344988344, "grad_norm": 0.4060158965672199, "learning_rate": 3.87379806393646e-05, "loss": 0.4601, "num_tokens": 1030010523.0, "step": 3935 }, { "epoch": 1.8368298368298368, "grad_norm": 0.40896752020498217, "learning_rate": 3.870791354749698e-05, "loss": 0.4688, "num_tokens": 1031319258.0, "step": 3940 }, { "epoch": 1.8391608391608392, "grad_norm": 0.44359916481371037, "learning_rate": 3.867781981241814e-05, "loss": 0.4889, "num_tokens": 1032629978.0, "step": 3945 }, { "epoch": 1.8414918414918415, "grad_norm": 0.40506992750006354, "learning_rate": 3.8647699505666265e-05, "loss": 0.4477, "num_tokens": 1033940698.0, "step": 3950 }, { "epoch": 1.843822843822844, "grad_norm": 0.39121466623448464, "learning_rate": 3.861755269884269e-05, "loss": 0.462, "num_tokens": 1035251418.0, "step": 3955 }, { "epoch": 1.8461538461538463, "grad_norm": 0.3762364215121074, "learning_rate": 3.8587379463611766e-05, "loss": 0.4718, "num_tokens": 1036562138.0, "step": 3960 }, { "epoch": 1.8484848484848486, "grad_norm": 0.38276793618104127, "learning_rate": 3.855717987170065e-05, "loss": 0.4694, "num_tokens": 1037868363.0, "step": 3965 }, { "epoch": 1.8508158508158508, "grad_norm": 0.3993712479498845, "learning_rate": 3.852695399489917e-05, "loss": 0.4632, "num_tokens": 1039179083.0, "step": 3970 }, { "epoch": 1.8531468531468531, "grad_norm": 0.3653526687250586, "learning_rate": 3.849670190505963e-05, "loss": 0.458, "num_tokens": 1040489803.0, "step": 3975 }, { "epoch": 1.8554778554778555, "grad_norm": 0.35478532717038763, "learning_rate": 3.846642367409663e-05, "loss": 0.4773, "num_tokens": 1041789042.0, "step": 3980 }, { "epoch": 1.8578088578088578, "grad_norm": 0.4152595626409347, "learning_rate": 3.843611937398695e-05, "loss": 0.4734, "num_tokens": 1043099762.0, "step": 3985 }, { "epoch": 1.86013986013986, "grad_norm": 0.3697811329438798, "learning_rate": 3.840578907676933e-05, "loss": 0.4603, "num_tokens": 1044410482.0, "step": 3990 }, { "epoch": 1.8624708624708624, "grad_norm": 0.3673405078658897, "learning_rate": 3.8375432854544265e-05, "loss": 0.468, "num_tokens": 1045721202.0, "step": 3995 }, { "epoch": 1.8648018648018647, "grad_norm": 0.3642674759468013, "learning_rate": 3.834505077947395e-05, "loss": 0.4679, "num_tokens": 1047031922.0, "step": 4000 }, { "epoch": 1.867132867132867, "grad_norm": 0.39831494968982045, "learning_rate": 3.831464292378199e-05, "loss": 0.4603, "num_tokens": 1048342642.0, "step": 4005 }, { "epoch": 1.8694638694638694, "grad_norm": 0.47030492710960053, "learning_rate": 3.828420935975328e-05, "loss": 0.4718, "num_tokens": 1049647085.0, "step": 4010 }, { "epoch": 1.8717948717948718, "grad_norm": 0.34963450633338594, "learning_rate": 3.825375015973383e-05, "loss": 0.4582, "num_tokens": 1050957805.0, "step": 4015 }, { "epoch": 1.8741258741258742, "grad_norm": 0.38080688538607055, "learning_rate": 3.822326539613061e-05, "loss": 0.4686, "num_tokens": 1052268525.0, "step": 4020 }, { "epoch": 1.8764568764568765, "grad_norm": 0.3878968790543752, "learning_rate": 3.819275514141134e-05, "loss": 0.4718, "num_tokens": 1053579245.0, "step": 4025 }, { "epoch": 1.878787878787879, "grad_norm": 0.4444788783917722, "learning_rate": 3.816221946810434e-05, "loss": 0.449, "num_tokens": 1054889965.0, "step": 4030 }, { "epoch": 1.8811188811188813, "grad_norm": 0.3855436235980361, "learning_rate": 3.813165844879835e-05, "loss": 0.4663, "num_tokens": 1056200685.0, "step": 4035 }, { "epoch": 1.8834498834498834, "grad_norm": 0.41749033912855776, "learning_rate": 3.8101072156142376e-05, "loss": 0.4721, "num_tokens": 1057507296.0, "step": 4040 }, { "epoch": 1.8857808857808858, "grad_norm": 0.3325680793207379, "learning_rate": 3.8070460662845495e-05, "loss": 0.4685, "num_tokens": 1058818016.0, "step": 4045 }, { "epoch": 1.8881118881118881, "grad_norm": 0.36642497795228024, "learning_rate": 3.80398240416767e-05, "loss": 0.4513, "num_tokens": 1060123820.0, "step": 4050 }, { "epoch": 1.8904428904428905, "grad_norm": 0.3301418844159666, "learning_rate": 3.800916236546468e-05, "loss": 0.4734, "num_tokens": 1061434540.0, "step": 4055 }, { "epoch": 1.8927738927738926, "grad_norm": 0.34768445537128323, "learning_rate": 3.797847570709775e-05, "loss": 0.4723, "num_tokens": 1062745260.0, "step": 4060 }, { "epoch": 1.895104895104895, "grad_norm": 0.347789179412769, "learning_rate": 3.794776413952354e-05, "loss": 0.4626, "num_tokens": 1064055980.0, "step": 4065 }, { "epoch": 1.8974358974358974, "grad_norm": 0.3462234651976999, "learning_rate": 3.7917027735748956e-05, "loss": 0.4607, "num_tokens": 1065366509.0, "step": 4070 }, { "epoch": 1.8997668997668997, "grad_norm": 0.38708966741877066, "learning_rate": 3.788626656883991e-05, "loss": 0.4826, "num_tokens": 1066677229.0, "step": 4075 }, { "epoch": 1.902097902097902, "grad_norm": 0.38785225263682604, "learning_rate": 3.785548071192117e-05, "loss": 0.4663, "num_tokens": 1067987949.0, "step": 4080 }, { "epoch": 1.9044289044289044, "grad_norm": 0.43056969345933427, "learning_rate": 3.782467023817623e-05, "loss": 0.4647, "num_tokens": 1069298669.0, "step": 4085 }, { "epoch": 1.9067599067599068, "grad_norm": 0.3412833063088448, "learning_rate": 3.7793835220847076e-05, "loss": 0.4678, "num_tokens": 1070609389.0, "step": 4090 }, { "epoch": 1.9090909090909092, "grad_norm": 0.4184934857430221, "learning_rate": 3.776297573323406e-05, "loss": 0.474, "num_tokens": 1071920109.0, "step": 4095 }, { "epoch": 1.9114219114219115, "grad_norm": 0.44954773016733546, "learning_rate": 3.7732091848695686e-05, "loss": 0.4647, "num_tokens": 1073230829.0, "step": 4100 }, { "epoch": 1.913752913752914, "grad_norm": 0.4771060848896429, "learning_rate": 3.770118364064846e-05, "loss": 0.4743, "num_tokens": 1074541549.0, "step": 4105 }, { "epoch": 1.916083916083916, "grad_norm": 0.3945529571970546, "learning_rate": 3.767025118256672e-05, "loss": 0.4691, "num_tokens": 1075852269.0, "step": 4110 }, { "epoch": 1.9184149184149184, "grad_norm": 0.40375250793365847, "learning_rate": 3.7639294547982416e-05, "loss": 0.4699, "num_tokens": 1077160473.0, "step": 4115 }, { "epoch": 1.9207459207459208, "grad_norm": 0.3682666412768804, "learning_rate": 3.760831381048503e-05, "loss": 0.4396, "num_tokens": 1078471193.0, "step": 4120 }, { "epoch": 1.9230769230769231, "grad_norm": 0.40270380854681137, "learning_rate": 3.757730904372127e-05, "loss": 0.4655, "num_tokens": 1079781913.0, "step": 4125 }, { "epoch": 1.9254079254079253, "grad_norm": 0.4091540171139127, "learning_rate": 3.754628032139502e-05, "loss": 0.4676, "num_tokens": 1081092633.0, "step": 4130 }, { "epoch": 1.9277389277389276, "grad_norm": 0.37723565465509623, "learning_rate": 3.75152277172671e-05, "loss": 0.458, "num_tokens": 1082403353.0, "step": 4135 }, { "epoch": 1.93006993006993, "grad_norm": 0.35463116725754923, "learning_rate": 3.7484151305155066e-05, "loss": 0.4601, "num_tokens": 1083697889.0, "step": 4140 }, { "epoch": 1.9324009324009324, "grad_norm": 0.4440845165505597, "learning_rate": 3.7453051158933124e-05, "loss": 0.4635, "num_tokens": 1084995264.0, "step": 4145 }, { "epoch": 1.9347319347319347, "grad_norm": 0.3844563537675778, "learning_rate": 3.742192735253186e-05, "loss": 0.4486, "num_tokens": 1086305984.0, "step": 4150 }, { "epoch": 1.937062937062937, "grad_norm": 0.3653521354637281, "learning_rate": 3.739077995993811e-05, "loss": 0.4609, "num_tokens": 1087616704.0, "step": 4155 }, { "epoch": 1.9393939393939394, "grad_norm": 0.3338487174568724, "learning_rate": 3.735960905519482e-05, "loss": 0.4475, "num_tokens": 1088926295.0, "step": 4160 }, { "epoch": 1.9417249417249418, "grad_norm": 0.3624856781426835, "learning_rate": 3.732841471240076e-05, "loss": 0.4515, "num_tokens": 1090237015.0, "step": 4165 }, { "epoch": 1.9440559440559442, "grad_norm": 0.34674172363160854, "learning_rate": 3.729719700571046e-05, "loss": 0.4581, "num_tokens": 1091547735.0, "step": 4170 }, { "epoch": 1.9463869463869465, "grad_norm": 0.3634515054857927, "learning_rate": 3.726595600933398e-05, "loss": 0.4614, "num_tokens": 1092858455.0, "step": 4175 }, { "epoch": 1.9487179487179487, "grad_norm": 0.38834463875662345, "learning_rate": 3.7234691797536746e-05, "loss": 0.4655, "num_tokens": 1094169175.0, "step": 4180 }, { "epoch": 1.951048951048951, "grad_norm": 0.4040443266819196, "learning_rate": 3.720340444463939e-05, "loss": 0.4603, "num_tokens": 1095479895.0, "step": 4185 }, { "epoch": 1.9533799533799534, "grad_norm": 0.37150785338102815, "learning_rate": 3.7172094025017504e-05, "loss": 0.4644, "num_tokens": 1096790615.0, "step": 4190 }, { "epoch": 1.9557109557109555, "grad_norm": 0.38391418767198954, "learning_rate": 3.714076061310157e-05, "loss": 0.47, "num_tokens": 1098101335.0, "step": 4195 }, { "epoch": 1.958041958041958, "grad_norm": 0.342628715510528, "learning_rate": 3.710940428337668e-05, "loss": 0.4598, "num_tokens": 1099412055.0, "step": 4200 }, { "epoch": 1.9603729603729603, "grad_norm": 0.35615594682543533, "learning_rate": 3.7078025110382455e-05, "loss": 0.453, "num_tokens": 1100722775.0, "step": 4205 }, { "epoch": 1.9627039627039626, "grad_norm": 0.34603620652492556, "learning_rate": 3.704662316871276e-05, "loss": 0.4821, "num_tokens": 1102033495.0, "step": 4210 }, { "epoch": 1.965034965034965, "grad_norm": 0.3544349334013842, "learning_rate": 3.7015198533015633e-05, "loss": 0.4739, "num_tokens": 1103344215.0, "step": 4215 }, { "epoch": 1.9673659673659674, "grad_norm": 0.3717399478589991, "learning_rate": 3.6983751277993045e-05, "loss": 0.4683, "num_tokens": 1104654935.0, "step": 4220 }, { "epoch": 1.9696969696969697, "grad_norm": 0.3807350804066158, "learning_rate": 3.6952281478400715e-05, "loss": 0.4721, "num_tokens": 1105965655.0, "step": 4225 }, { "epoch": 1.972027972027972, "grad_norm": 0.36855589171035036, "learning_rate": 3.692078920904799e-05, "loss": 0.4701, "num_tokens": 1107267769.0, "step": 4230 }, { "epoch": 1.9743589743589745, "grad_norm": 0.31575827164633735, "learning_rate": 3.688927454479763e-05, "loss": 0.4512, "num_tokens": 1108578489.0, "step": 4235 }, { "epoch": 1.9766899766899768, "grad_norm": 0.4085486617161448, "learning_rate": 3.6857737560565584e-05, "loss": 0.4624, "num_tokens": 1109889209.0, "step": 4240 }, { "epoch": 1.9790209790209792, "grad_norm": 0.33727800612662295, "learning_rate": 3.682617833132092e-05, "loss": 0.4427, "num_tokens": 1111190573.0, "step": 4245 }, { "epoch": 1.9813519813519813, "grad_norm": 0.4098762469941274, "learning_rate": 3.679459693208555e-05, "loss": 0.4656, "num_tokens": 1112501293.0, "step": 4250 }, { "epoch": 1.9836829836829837, "grad_norm": 0.3985077104938252, "learning_rate": 3.6762993437934094e-05, "loss": 0.4585, "num_tokens": 1113812013.0, "step": 4255 }, { "epoch": 1.986013986013986, "grad_norm": 0.37492705859719844, "learning_rate": 3.673136792399371e-05, "loss": 0.4589, "num_tokens": 1115112034.0, "step": 4260 }, { "epoch": 1.9883449883449882, "grad_norm": 0.4337730376679739, "learning_rate": 3.6699720465443885e-05, "loss": 0.471, "num_tokens": 1116422754.0, "step": 4265 }, { "epoch": 1.9906759906759905, "grad_norm": 0.36853486357793097, "learning_rate": 3.6668051137516275e-05, "loss": 0.4793, "num_tokens": 1117733474.0, "step": 4270 }, { "epoch": 1.993006993006993, "grad_norm": 0.4408364026741728, "learning_rate": 3.663636001549452e-05, "loss": 0.4637, "num_tokens": 1119044194.0, "step": 4275 }, { "epoch": 1.9953379953379953, "grad_norm": 0.4221479195643412, "learning_rate": 3.660464717471408e-05, "loss": 0.4608, "num_tokens": 1120354914.0, "step": 4280 }, { "epoch": 1.9976689976689976, "grad_norm": 0.37516116056322824, "learning_rate": 3.6572912690562045e-05, "loss": 0.4605, "num_tokens": 1121665634.0, "step": 4285 }, { "epoch": 2.0, "grad_norm": 0.4487985158857049, "learning_rate": 3.654115663847694e-05, "loss": 0.4591, "num_tokens": 1122976354.0, "step": 4290 }, { "epoch": 2.0023310023310024, "grad_norm": 0.41022871529290744, "learning_rate": 3.650937909394857e-05, "loss": 0.4071, "num_tokens": 1124287074.0, "step": 4295 }, { "epoch": 2.0046620046620047, "grad_norm": 0.38634036595793075, "learning_rate": 3.6477580132517833e-05, "loss": 0.4082, "num_tokens": 1125594773.0, "step": 4300 }, { "epoch": 2.006993006993007, "grad_norm": 0.3650522398368059, "learning_rate": 3.644575982977655e-05, "loss": 0.4186, "num_tokens": 1126905493.0, "step": 4305 }, { "epoch": 2.0093240093240095, "grad_norm": 0.3719826700428957, "learning_rate": 3.641391826136724e-05, "loss": 0.4182, "num_tokens": 1128216213.0, "step": 4310 }, { "epoch": 2.011655011655012, "grad_norm": 0.36321886526329594, "learning_rate": 3.6382055502983e-05, "loss": 0.416, "num_tokens": 1129526933.0, "step": 4315 }, { "epoch": 2.013986013986014, "grad_norm": 0.38670247193489393, "learning_rate": 3.63501716303673e-05, "loss": 0.4081, "num_tokens": 1130837653.0, "step": 4320 }, { "epoch": 2.016317016317016, "grad_norm": 0.4267019144244097, "learning_rate": 3.631826671931379e-05, "loss": 0.4238, "num_tokens": 1132148373.0, "step": 4325 }, { "epoch": 2.0186480186480185, "grad_norm": 0.3862354868842951, "learning_rate": 3.628634084566615e-05, "loss": 0.4009, "num_tokens": 1133459093.0, "step": 4330 }, { "epoch": 2.020979020979021, "grad_norm": 0.3553448746301145, "learning_rate": 3.625439408531787e-05, "loss": 0.4141, "num_tokens": 1134757659.0, "step": 4335 }, { "epoch": 2.023310023310023, "grad_norm": 0.3988598508713757, "learning_rate": 3.62224265142121e-05, "loss": 0.4054, "num_tokens": 1136052195.0, "step": 4340 }, { "epoch": 2.0256410256410255, "grad_norm": 0.4052184398184166, "learning_rate": 3.6190438208341484e-05, "loss": 0.4113, "num_tokens": 1137362915.0, "step": 4345 }, { "epoch": 2.027972027972028, "grad_norm": 0.3848516871568244, "learning_rate": 3.615842924374791e-05, "loss": 0.4153, "num_tokens": 1138673635.0, "step": 4350 }, { "epoch": 2.0303030303030303, "grad_norm": 0.36445962395799664, "learning_rate": 3.6126399696522413e-05, "loss": 0.4067, "num_tokens": 1139984355.0, "step": 4355 }, { "epoch": 2.0326340326340326, "grad_norm": 0.4070066068730233, "learning_rate": 3.609434964280495e-05, "loss": 0.4114, "num_tokens": 1141285456.0, "step": 4360 }, { "epoch": 2.034965034965035, "grad_norm": 0.3864432857902367, "learning_rate": 3.6062279158784205e-05, "loss": 0.4047, "num_tokens": 1142596176.0, "step": 4365 }, { "epoch": 2.0372960372960374, "grad_norm": 0.3673852417907459, "learning_rate": 3.603018832069744e-05, "loss": 0.4178, "num_tokens": 1143906896.0, "step": 4370 }, { "epoch": 2.0396270396270397, "grad_norm": 0.37584930043751846, "learning_rate": 3.599807720483034e-05, "loss": 0.418, "num_tokens": 1145217616.0, "step": 4375 }, { "epoch": 2.041958041958042, "grad_norm": 0.36602579882002695, "learning_rate": 3.5965945887516715e-05, "loss": 0.4056, "num_tokens": 1146528336.0, "step": 4380 }, { "epoch": 2.0442890442890445, "grad_norm": 0.351202961985245, "learning_rate": 3.593379444513848e-05, "loss": 0.3902, "num_tokens": 1147839056.0, "step": 4385 }, { "epoch": 2.046620046620047, "grad_norm": 0.4105988426092805, "learning_rate": 3.590162295412533e-05, "loss": 0.3981, "num_tokens": 1149142866.0, "step": 4390 }, { "epoch": 2.0489510489510487, "grad_norm": 0.38250301137351694, "learning_rate": 3.586943149095464e-05, "loss": 0.4103, "num_tokens": 1150453586.0, "step": 4395 }, { "epoch": 2.051282051282051, "grad_norm": 0.3386603986144158, "learning_rate": 3.5837220132151286e-05, "loss": 0.4069, "num_tokens": 1151764306.0, "step": 4400 }, { "epoch": 2.0536130536130535, "grad_norm": 0.35936045689135193, "learning_rate": 3.58049889542874e-05, "loss": 0.4141, "num_tokens": 1153075026.0, "step": 4405 }, { "epoch": 2.055944055944056, "grad_norm": 0.35837033557087505, "learning_rate": 3.577273803398225e-05, "loss": 0.4302, "num_tokens": 1154385746.0, "step": 4410 }, { "epoch": 2.058275058275058, "grad_norm": 0.35713780316116617, "learning_rate": 3.574046744790203e-05, "loss": 0.4052, "num_tokens": 1155696466.0, "step": 4415 }, { "epoch": 2.0606060606060606, "grad_norm": 0.3834311499947858, "learning_rate": 3.570817727275968e-05, "loss": 0.4107, "num_tokens": 1156996487.0, "step": 4420 }, { "epoch": 2.062937062937063, "grad_norm": 0.37490633384840205, "learning_rate": 3.567586758531471e-05, "loss": 0.4154, "num_tokens": 1158307207.0, "step": 4425 }, { "epoch": 2.0652680652680653, "grad_norm": 0.3741096498424809, "learning_rate": 3.5643538462373035e-05, "loss": 0.403, "num_tokens": 1159617927.0, "step": 4430 }, { "epoch": 2.0675990675990676, "grad_norm": 0.37754550383970564, "learning_rate": 3.561118998078673e-05, "loss": 0.4057, "num_tokens": 1160928647.0, "step": 4435 }, { "epoch": 2.06993006993007, "grad_norm": 0.4251054782461779, "learning_rate": 3.55788222174539e-05, "loss": 0.4188, "num_tokens": 1162239367.0, "step": 4440 }, { "epoch": 2.0722610722610724, "grad_norm": 0.3266838827050116, "learning_rate": 3.5546435249318535e-05, "loss": 0.4088, "num_tokens": 1163550087.0, "step": 4445 }, { "epoch": 2.0745920745920747, "grad_norm": 0.37379003655649706, "learning_rate": 3.551402915337021e-05, "loss": 0.4075, "num_tokens": 1164860807.0, "step": 4450 }, { "epoch": 2.076923076923077, "grad_norm": 0.3572437557048024, "learning_rate": 3.5481604006644e-05, "loss": 0.4179, "num_tokens": 1166171527.0, "step": 4455 }, { "epoch": 2.0792540792540795, "grad_norm": 0.3859455958746963, "learning_rate": 3.544915988622028e-05, "loss": 0.4237, "num_tokens": 1167482247.0, "step": 4460 }, { "epoch": 2.0815850815850814, "grad_norm": 0.4603570748371342, "learning_rate": 3.5416696869224504e-05, "loss": 0.4286, "num_tokens": 1168792967.0, "step": 4465 }, { "epoch": 2.0839160839160837, "grad_norm": 0.41262993381270163, "learning_rate": 3.538421503282707e-05, "loss": 0.4136, "num_tokens": 1170103687.0, "step": 4470 }, { "epoch": 2.086247086247086, "grad_norm": 0.37590755119976543, "learning_rate": 3.5351714454243096e-05, "loss": 0.4251, "num_tokens": 1171414407.0, "step": 4475 }, { "epoch": 2.0885780885780885, "grad_norm": 0.3465536685672719, "learning_rate": 3.531919521073225e-05, "loss": 0.4157, "num_tokens": 1172725127.0, "step": 4480 }, { "epoch": 2.090909090909091, "grad_norm": 0.42464850126292525, "learning_rate": 3.5286657379598586e-05, "loss": 0.405, "num_tokens": 1174035847.0, "step": 4485 }, { "epoch": 2.093240093240093, "grad_norm": 0.3901689607224437, "learning_rate": 3.5254101038190345e-05, "loss": 0.4168, "num_tokens": 1175346567.0, "step": 4490 }, { "epoch": 2.0955710955710956, "grad_norm": 0.4360019158309819, "learning_rate": 3.522152626389975e-05, "loss": 0.4151, "num_tokens": 1176657287.0, "step": 4495 }, { "epoch": 2.097902097902098, "grad_norm": 0.3502067766511093, "learning_rate": 3.5188933134162865e-05, "loss": 0.4224, "num_tokens": 1177968007.0, "step": 4500 }, { "epoch": 2.1002331002331003, "grad_norm": 0.5275300365734853, "learning_rate": 3.515632172645937e-05, "loss": 0.4159, "num_tokens": 1179278727.0, "step": 4505 }, { "epoch": 2.1025641025641026, "grad_norm": 0.3789515598432233, "learning_rate": 3.51236921183124e-05, "loss": 0.4191, "num_tokens": 1180589447.0, "step": 4510 }, { "epoch": 2.104895104895105, "grad_norm": 0.365833048618729, "learning_rate": 3.509104438728837e-05, "loss": 0.4059, "num_tokens": 1181900167.0, "step": 4515 }, { "epoch": 2.1072261072261074, "grad_norm": 0.35537158917962436, "learning_rate": 3.505837861099676e-05, "loss": 0.4234, "num_tokens": 1183210887.0, "step": 4520 }, { "epoch": 2.1095571095571097, "grad_norm": 0.35631156133374037, "learning_rate": 3.5025694867089945e-05, "loss": 0.4111, "num_tokens": 1184521607.0, "step": 4525 }, { "epoch": 2.111888111888112, "grad_norm": 0.38809219895458275, "learning_rate": 3.499299323326302e-05, "loss": 0.421, "num_tokens": 1185832327.0, "step": 4530 }, { "epoch": 2.114219114219114, "grad_norm": 0.3497921359981421, "learning_rate": 3.496027378725361e-05, "loss": 0.407, "num_tokens": 1187143047.0, "step": 4535 }, { "epoch": 2.1165501165501164, "grad_norm": 0.40913669577154826, "learning_rate": 3.492753660684167e-05, "loss": 0.4033, "num_tokens": 1188453767.0, "step": 4540 }, { "epoch": 2.1188811188811187, "grad_norm": 0.39030245051407625, "learning_rate": 3.489478176984934e-05, "loss": 0.4217, "num_tokens": 1189756471.0, "step": 4545 }, { "epoch": 2.121212121212121, "grad_norm": 0.3443883610438411, "learning_rate": 3.48620093541407e-05, "loss": 0.4232, "num_tokens": 1191067191.0, "step": 4550 }, { "epoch": 2.1235431235431235, "grad_norm": 0.3841739344611504, "learning_rate": 3.482921943762163e-05, "loss": 0.4141, "num_tokens": 1192377911.0, "step": 4555 }, { "epoch": 2.125874125874126, "grad_norm": 0.3383108963469932, "learning_rate": 3.479641209823964e-05, "loss": 0.4092, "num_tokens": 1193688631.0, "step": 4560 }, { "epoch": 2.128205128205128, "grad_norm": 0.3760327145277456, "learning_rate": 3.47635874139836e-05, "loss": 0.4087, "num_tokens": 1194999351.0, "step": 4565 }, { "epoch": 2.1305361305361306, "grad_norm": 0.34503872631449223, "learning_rate": 3.473074546288366e-05, "loss": 0.4048, "num_tokens": 1196310071.0, "step": 4570 }, { "epoch": 2.132867132867133, "grad_norm": 0.3580599391213381, "learning_rate": 3.4697886323010994e-05, "loss": 0.4152, "num_tokens": 1197608942.0, "step": 4575 }, { "epoch": 2.1351981351981353, "grad_norm": 0.435723923799417, "learning_rate": 3.466501007247764e-05, "loss": 0.4286, "num_tokens": 1198919662.0, "step": 4580 }, { "epoch": 2.1375291375291376, "grad_norm": 0.40488968120953833, "learning_rate": 3.4632116789436334e-05, "loss": 0.4118, "num_tokens": 1200230382.0, "step": 4585 }, { "epoch": 2.13986013986014, "grad_norm": 0.36702045490517987, "learning_rate": 3.459920655208027e-05, "loss": 0.4118, "num_tokens": 1201528689.0, "step": 4590 }, { "epoch": 2.1421911421911424, "grad_norm": 0.36787552508826277, "learning_rate": 3.456627943864295e-05, "loss": 0.4184, "num_tokens": 1202839409.0, "step": 4595 }, { "epoch": 2.1445221445221447, "grad_norm": 0.3756630907032684, "learning_rate": 3.453333552739801e-05, "loss": 0.4053, "num_tokens": 1204150129.0, "step": 4600 }, { "epoch": 2.1468531468531467, "grad_norm": 0.3579470200059966, "learning_rate": 3.4500374896658996e-05, "loss": 0.4147, "num_tokens": 1205460849.0, "step": 4605 }, { "epoch": 2.149184149184149, "grad_norm": 0.3946329332145125, "learning_rate": 3.446739762477922e-05, "loss": 0.4207, "num_tokens": 1206761500.0, "step": 4610 }, { "epoch": 2.1515151515151514, "grad_norm": 0.3580747430804025, "learning_rate": 3.4434403790151546e-05, "loss": 0.3979, "num_tokens": 1208065281.0, "step": 4615 }, { "epoch": 2.1538461538461537, "grad_norm": 0.36202526630950443, "learning_rate": 3.44013934712082e-05, "loss": 0.4197, "num_tokens": 1209376001.0, "step": 4620 }, { "epoch": 2.156177156177156, "grad_norm": 0.3794229385793346, "learning_rate": 3.4368366746420613e-05, "loss": 0.4259, "num_tokens": 1210686721.0, "step": 4625 }, { "epoch": 2.1585081585081585, "grad_norm": 0.3356824344579626, "learning_rate": 3.4335323694299205e-05, "loss": 0.4168, "num_tokens": 1211997441.0, "step": 4630 }, { "epoch": 2.160839160839161, "grad_norm": 0.3749451829370277, "learning_rate": 3.43022643933932e-05, "loss": 0.4106, "num_tokens": 1213308161.0, "step": 4635 }, { "epoch": 2.163170163170163, "grad_norm": 0.4016684163697115, "learning_rate": 3.426918892229046e-05, "loss": 0.4098, "num_tokens": 1214618881.0, "step": 4640 }, { "epoch": 2.1655011655011656, "grad_norm": 0.4000860482831418, "learning_rate": 3.423609735961729e-05, "loss": 0.41, "num_tokens": 1215929601.0, "step": 4645 }, { "epoch": 2.167832167832168, "grad_norm": 0.4074946003687253, "learning_rate": 3.420298978403824e-05, "loss": 0.418, "num_tokens": 1217240321.0, "step": 4650 }, { "epoch": 2.1701631701631703, "grad_norm": 0.3696291402654651, "learning_rate": 3.4169866274255926e-05, "loss": 0.4149, "num_tokens": 1218544211.0, "step": 4655 }, { "epoch": 2.1724941724941726, "grad_norm": 0.38844912853019314, "learning_rate": 3.413672690901084e-05, "loss": 0.4059, "num_tokens": 1219854931.0, "step": 4660 }, { "epoch": 2.174825174825175, "grad_norm": 0.33716786832291057, "learning_rate": 3.410357176708118e-05, "loss": 0.4033, "num_tokens": 1221165651.0, "step": 4665 }, { "epoch": 2.177156177156177, "grad_norm": 0.36180594189314, "learning_rate": 3.4070400927282616e-05, "loss": 0.4134, "num_tokens": 1222476371.0, "step": 4670 }, { "epoch": 2.1794871794871793, "grad_norm": 0.34683197522568154, "learning_rate": 3.403721446846818e-05, "loss": 0.3892, "num_tokens": 1223784575.0, "step": 4675 }, { "epoch": 2.1818181818181817, "grad_norm": 0.3776362556179136, "learning_rate": 3.400401246952798e-05, "loss": 0.4259, "num_tokens": 1225095104.0, "step": 4680 }, { "epoch": 2.184149184149184, "grad_norm": 0.3518038107992216, "learning_rate": 3.397079500938913e-05, "loss": 0.4227, "num_tokens": 1226405824.0, "step": 4685 }, { "epoch": 2.1864801864801864, "grad_norm": 0.38008130799789136, "learning_rate": 3.3937562167015444e-05, "loss": 0.4192, "num_tokens": 1227716544.0, "step": 4690 }, { "epoch": 2.1888111888111887, "grad_norm": 0.410030351985677, "learning_rate": 3.3904314021407306e-05, "loss": 0.4187, "num_tokens": 1229027264.0, "step": 4695 }, { "epoch": 2.191142191142191, "grad_norm": 0.44730022868266656, "learning_rate": 3.3871050651601526e-05, "loss": 0.4035, "num_tokens": 1230337984.0, "step": 4700 }, { "epoch": 2.1934731934731935, "grad_norm": 0.4230202641947911, "learning_rate": 3.383777213667104e-05, "loss": 0.4354, "num_tokens": 1231648704.0, "step": 4705 }, { "epoch": 2.195804195804196, "grad_norm": 0.39299528264780836, "learning_rate": 3.3804478555724836e-05, "loss": 0.4189, "num_tokens": 1232959424.0, "step": 4710 }, { "epoch": 2.198135198135198, "grad_norm": 0.33696525877227246, "learning_rate": 3.3771169987907694e-05, "loss": 0.3992, "num_tokens": 1234270144.0, "step": 4715 }, { "epoch": 2.2004662004662006, "grad_norm": 0.3608578810758537, "learning_rate": 3.373784651240003e-05, "loss": 0.4138, "num_tokens": 1235580864.0, "step": 4720 }, { "epoch": 2.202797202797203, "grad_norm": 0.36875097820212976, "learning_rate": 3.370450820841769e-05, "loss": 0.4168, "num_tokens": 1236891584.0, "step": 4725 }, { "epoch": 2.2051282051282053, "grad_norm": 0.3857628385194161, "learning_rate": 3.3671155155211775e-05, "loss": 0.4126, "num_tokens": 1238197409.0, "step": 4730 }, { "epoch": 2.2074592074592077, "grad_norm": 0.36269590587156925, "learning_rate": 3.363778743206844e-05, "loss": 0.4124, "num_tokens": 1239508129.0, "step": 4735 }, { "epoch": 2.20979020979021, "grad_norm": 0.37684424909135084, "learning_rate": 3.360440511830873e-05, "loss": 0.4051, "num_tokens": 1240818849.0, "step": 4740 }, { "epoch": 2.212121212121212, "grad_norm": 0.33976488415925243, "learning_rate": 3.3571008293288366e-05, "loss": 0.4058, "num_tokens": 1242129569.0, "step": 4745 }, { "epoch": 2.2144522144522143, "grad_norm": 0.3390834642562686, "learning_rate": 3.3537597036397555e-05, "loss": 0.3954, "num_tokens": 1243440289.0, "step": 4750 }, { "epoch": 2.2167832167832167, "grad_norm": 0.3625798993977378, "learning_rate": 3.35041714270608e-05, "loss": 0.415, "num_tokens": 1244751009.0, "step": 4755 }, { "epoch": 2.219114219114219, "grad_norm": 0.3637371812561485, "learning_rate": 3.3470731544736784e-05, "loss": 0.4099, "num_tokens": 1246061729.0, "step": 4760 }, { "epoch": 2.2214452214452214, "grad_norm": 0.3469567003110868, "learning_rate": 3.3437277468918046e-05, "loss": 0.4205, "num_tokens": 1247372449.0, "step": 4765 }, { "epoch": 2.2237762237762237, "grad_norm": 0.3626483429391148, "learning_rate": 3.3403809279130904e-05, "loss": 0.4348, "num_tokens": 1248679073.0, "step": 4770 }, { "epoch": 2.226107226107226, "grad_norm": 0.34400225732618517, "learning_rate": 3.337032705493522e-05, "loss": 0.4088, "num_tokens": 1249989793.0, "step": 4775 }, { "epoch": 2.2284382284382285, "grad_norm": 0.42160458003014656, "learning_rate": 3.333683087592421e-05, "loss": 0.4182, "num_tokens": 1251300513.0, "step": 4780 }, { "epoch": 2.230769230769231, "grad_norm": 0.3628491977962132, "learning_rate": 3.3303320821724285e-05, "loss": 0.4263, "num_tokens": 1252597120.0, "step": 4785 }, { "epoch": 2.233100233100233, "grad_norm": 0.3646447806760171, "learning_rate": 3.326979697199482e-05, "loss": 0.4206, "num_tokens": 1253907840.0, "step": 4790 }, { "epoch": 2.2354312354312356, "grad_norm": 0.42668719182222326, "learning_rate": 3.323625940642797e-05, "loss": 0.4124, "num_tokens": 1255218560.0, "step": 4795 }, { "epoch": 2.237762237762238, "grad_norm": 0.38155232884871143, "learning_rate": 3.320270820474856e-05, "loss": 0.4019, "num_tokens": 1256529280.0, "step": 4800 }, { "epoch": 2.2400932400932403, "grad_norm": 0.3680246772827745, "learning_rate": 3.316914344671374e-05, "loss": 0.424, "num_tokens": 1257840000.0, "step": 4805 }, { "epoch": 2.242424242424242, "grad_norm": 0.3538991106150229, "learning_rate": 3.313556521211296e-05, "loss": 0.4171, "num_tokens": 1259150720.0, "step": 4810 }, { "epoch": 2.2447552447552446, "grad_norm": 0.3799814749588959, "learning_rate": 3.310197358076767e-05, "loss": 0.4089, "num_tokens": 1260448949.0, "step": 4815 }, { "epoch": 2.247086247086247, "grad_norm": 0.3534336331691378, "learning_rate": 3.3068368632531166e-05, "loss": 0.419, "num_tokens": 1261759669.0, "step": 4820 }, { "epoch": 2.2494172494172493, "grad_norm": 0.3948143146379037, "learning_rate": 3.303475044728842e-05, "loss": 0.4341, "num_tokens": 1263070389.0, "step": 4825 }, { "epoch": 2.2517482517482517, "grad_norm": 0.38620152575690947, "learning_rate": 3.3001119104955856e-05, "loss": 0.3993, "num_tokens": 1264381109.0, "step": 4830 }, { "epoch": 2.254079254079254, "grad_norm": 0.409856401580112, "learning_rate": 3.296747468548117e-05, "loss": 0.4284, "num_tokens": 1265691829.0, "step": 4835 }, { "epoch": 2.2564102564102564, "grad_norm": 0.3894321494198609, "learning_rate": 3.2933817268843175e-05, "loss": 0.4044, "num_tokens": 1267002549.0, "step": 4840 }, { "epoch": 2.2587412587412588, "grad_norm": 0.44554017981068295, "learning_rate": 3.2900146935051535e-05, "loss": 0.4046, "num_tokens": 1268313269.0, "step": 4845 }, { "epoch": 2.261072261072261, "grad_norm": 0.38537941934187847, "learning_rate": 3.2866463764146647e-05, "loss": 0.4088, "num_tokens": 1269611053.0, "step": 4850 }, { "epoch": 2.2634032634032635, "grad_norm": 0.34106108845050315, "learning_rate": 3.2832767836199435e-05, "loss": 0.4066, "num_tokens": 1270921773.0, "step": 4855 }, { "epoch": 2.265734265734266, "grad_norm": 0.37822769502708925, "learning_rate": 3.279905923131112e-05, "loss": 0.4352, "num_tokens": 1272232493.0, "step": 4860 }, { "epoch": 2.268065268065268, "grad_norm": 0.3717980750928343, "learning_rate": 3.276533802961308e-05, "loss": 0.4149, "num_tokens": 1273533373.0, "step": 4865 }, { "epoch": 2.2703962703962706, "grad_norm": 0.40049719958281904, "learning_rate": 3.273160431126664e-05, "loss": 0.4149, "num_tokens": 1274825773.0, "step": 4870 }, { "epoch": 2.2727272727272725, "grad_norm": 0.3492455161378825, "learning_rate": 3.269785815646286e-05, "loss": 0.4078, "num_tokens": 1276121190.0, "step": 4875 }, { "epoch": 2.2750582750582753, "grad_norm": 0.3434108328323875, "learning_rate": 3.266409964542236e-05, "loss": 0.4315, "num_tokens": 1277431910.0, "step": 4880 }, { "epoch": 2.277389277389277, "grad_norm": 0.34434993144818066, "learning_rate": 3.263032885839517e-05, "loss": 0.3986, "num_tokens": 1278729221.0, "step": 4885 }, { "epoch": 2.2797202797202796, "grad_norm": 0.32968474076252635, "learning_rate": 3.2596545875660474e-05, "loss": 0.4029, "num_tokens": 1280039941.0, "step": 4890 }, { "epoch": 2.282051282051282, "grad_norm": 0.3533987796359302, "learning_rate": 3.256275077752644e-05, "loss": 0.4132, "num_tokens": 1281350661.0, "step": 4895 }, { "epoch": 2.2843822843822843, "grad_norm": 0.36035857510185826, "learning_rate": 3.2528943644330066e-05, "loss": 0.4062, "num_tokens": 1282661381.0, "step": 4900 }, { "epoch": 2.2867132867132867, "grad_norm": 0.4102624567196318, "learning_rate": 3.2495124556436935e-05, "loss": 0.405, "num_tokens": 1283972101.0, "step": 4905 }, { "epoch": 2.289044289044289, "grad_norm": 0.32975812878362454, "learning_rate": 3.246129359424105e-05, "loss": 0.4183, "num_tokens": 1285282821.0, "step": 4910 }, { "epoch": 2.2913752913752914, "grad_norm": 0.3671230776462342, "learning_rate": 3.2427450838164665e-05, "loss": 0.4202, "num_tokens": 1286593541.0, "step": 4915 }, { "epoch": 2.2937062937062938, "grad_norm": 0.35487029443793194, "learning_rate": 3.239359636865803e-05, "loss": 0.4135, "num_tokens": 1287904261.0, "step": 4920 }, { "epoch": 2.296037296037296, "grad_norm": 0.35127658402893597, "learning_rate": 3.235973026619928e-05, "loss": 0.4119, "num_tokens": 1289214981.0, "step": 4925 }, { "epoch": 2.2983682983682985, "grad_norm": 0.34383367438253826, "learning_rate": 3.2325852611294175e-05, "loss": 0.4191, "num_tokens": 1290517020.0, "step": 4930 }, { "epoch": 2.300699300699301, "grad_norm": 0.3781283212650206, "learning_rate": 3.229196348447595e-05, "loss": 0.4133, "num_tokens": 1291814376.0, "step": 4935 }, { "epoch": 2.303030303030303, "grad_norm": 0.3797225808633411, "learning_rate": 3.225806296630512e-05, "loss": 0.4314, "num_tokens": 1293125096.0, "step": 4940 }, { "epoch": 2.3053613053613056, "grad_norm": 0.3820018986940426, "learning_rate": 3.2224151137369244e-05, "loss": 0.4089, "num_tokens": 1294422895.0, "step": 4945 }, { "epoch": 2.3076923076923075, "grad_norm": 0.3535499522376053, "learning_rate": 3.219022807828282e-05, "loss": 0.4105, "num_tokens": 1295733615.0, "step": 4950 }, { "epoch": 2.31002331002331, "grad_norm": 0.35691493546859565, "learning_rate": 3.215629386968701e-05, "loss": 0.4103, "num_tokens": 1297044335.0, "step": 4955 }, { "epoch": 2.312354312354312, "grad_norm": 0.3387110035916893, "learning_rate": 3.212234859224946e-05, "loss": 0.4284, "num_tokens": 1298355055.0, "step": 4960 }, { "epoch": 2.3146853146853146, "grad_norm": 0.3710615014269036, "learning_rate": 3.208839232666419e-05, "loss": 0.4101, "num_tokens": 1299665775.0, "step": 4965 }, { "epoch": 2.317016317016317, "grad_norm": 0.3373474545094395, "learning_rate": 3.205442515365128e-05, "loss": 0.4088, "num_tokens": 1300976495.0, "step": 4970 }, { "epoch": 2.3193473193473193, "grad_norm": 0.34943245534248407, "learning_rate": 3.202044715395677e-05, "loss": 0.4291, "num_tokens": 1302287215.0, "step": 4975 }, { "epoch": 2.3216783216783217, "grad_norm": 0.3617392320216389, "learning_rate": 3.198645840835243e-05, "loss": 0.4289, "num_tokens": 1303590183.0, "step": 4980 }, { "epoch": 2.324009324009324, "grad_norm": 0.37806058308011203, "learning_rate": 3.195245899763559e-05, "loss": 0.4104, "num_tokens": 1304900903.0, "step": 4985 }, { "epoch": 2.3263403263403264, "grad_norm": 0.3317302716295439, "learning_rate": 3.1918449002628895e-05, "loss": 0.4084, "num_tokens": 1306211623.0, "step": 4990 }, { "epoch": 2.3286713286713288, "grad_norm": 0.3435386897987483, "learning_rate": 3.1884428504180186e-05, "loss": 0.4135, "num_tokens": 1307522343.0, "step": 4995 }, { "epoch": 2.331002331002331, "grad_norm": 0.360164719000458, "learning_rate": 3.185039758316226e-05, "loss": 0.4115, "num_tokens": 1308833063.0, "step": 5000 }, { "epoch": 2.3333333333333335, "grad_norm": 0.3945377389793074, "learning_rate": 3.1816356320472695e-05, "loss": 0.4188, "num_tokens": 1310143783.0, "step": 5005 }, { "epoch": 2.335664335664336, "grad_norm": 0.4051939320818589, "learning_rate": 3.178230479703364e-05, "loss": 0.4203, "num_tokens": 1311454503.0, "step": 5010 }, { "epoch": 2.3379953379953378, "grad_norm": 0.35708148981072574, "learning_rate": 3.174824309379166e-05, "loss": 0.418, "num_tokens": 1312765223.0, "step": 5015 }, { "epoch": 2.3403263403263406, "grad_norm": 0.33742717843001446, "learning_rate": 3.1714171291717486e-05, "loss": 0.4084, "num_tokens": 1314069304.0, "step": 5020 }, { "epoch": 2.3426573426573425, "grad_norm": 0.3589509813194004, "learning_rate": 3.168008947180588e-05, "loss": 0.4045, "num_tokens": 1315380024.0, "step": 5025 }, { "epoch": 2.344988344988345, "grad_norm": 0.3508912054860665, "learning_rate": 3.1645997715075426e-05, "loss": 0.4033, "num_tokens": 1316690744.0, "step": 5030 }, { "epoch": 2.347319347319347, "grad_norm": 0.3295215331291118, "learning_rate": 3.161189610256829e-05, "loss": 0.4066, "num_tokens": 1318001464.0, "step": 5035 }, { "epoch": 2.3496503496503496, "grad_norm": 0.35230973624032774, "learning_rate": 3.157778471535011e-05, "loss": 0.417, "num_tokens": 1319312184.0, "step": 5040 }, { "epoch": 2.351981351981352, "grad_norm": 0.3884539975737978, "learning_rate": 3.154366363450974e-05, "loss": 0.4236, "num_tokens": 1320618337.0, "step": 5045 }, { "epoch": 2.3543123543123543, "grad_norm": 0.3698867601064244, "learning_rate": 3.150953294115907e-05, "loss": 0.4054, "num_tokens": 1321929057.0, "step": 5050 }, { "epoch": 2.3566433566433567, "grad_norm": 0.3652929150276016, "learning_rate": 3.147539271643287e-05, "loss": 0.4267, "num_tokens": 1323239777.0, "step": 5055 }, { "epoch": 2.358974358974359, "grad_norm": 0.35528129394096414, "learning_rate": 3.1441243041488525e-05, "loss": 0.4336, "num_tokens": 1324550497.0, "step": 5060 }, { "epoch": 2.3613053613053614, "grad_norm": 0.37876386202795675, "learning_rate": 3.140708399750594e-05, "loss": 0.425, "num_tokens": 1325861217.0, "step": 5065 }, { "epoch": 2.3636363636363638, "grad_norm": 0.35526472594618097, "learning_rate": 3.1372915665687225e-05, "loss": 0.4073, "num_tokens": 1327171937.0, "step": 5070 }, { "epoch": 2.365967365967366, "grad_norm": 0.3550032233081871, "learning_rate": 3.133873812725662e-05, "loss": 0.4078, "num_tokens": 1328482657.0, "step": 5075 }, { "epoch": 2.3682983682983685, "grad_norm": 0.3431998998307288, "learning_rate": 3.130455146346024e-05, "loss": 0.4105, "num_tokens": 1329791355.0, "step": 5080 }, { "epoch": 2.370629370629371, "grad_norm": 0.3608126350221179, "learning_rate": 3.1270355755565886e-05, "loss": 0.4262, "num_tokens": 1331102075.0, "step": 5085 }, { "epoch": 2.3729603729603728, "grad_norm": 0.3494733230503953, "learning_rate": 3.123615108486286e-05, "loss": 0.4238, "num_tokens": 1332404189.0, "step": 5090 }, { "epoch": 2.375291375291375, "grad_norm": 0.338730898909754, "learning_rate": 3.120193753266175e-05, "loss": 0.4191, "num_tokens": 1333714909.0, "step": 5095 }, { "epoch": 2.3776223776223775, "grad_norm": 0.402224903788031, "learning_rate": 3.116771518029431e-05, "loss": 0.4161, "num_tokens": 1335025629.0, "step": 5100 }, { "epoch": 2.37995337995338, "grad_norm": 0.36595610812881385, "learning_rate": 3.113348410911316e-05, "loss": 0.4081, "num_tokens": 1336336349.0, "step": 5105 }, { "epoch": 2.382284382284382, "grad_norm": 0.3880930752201236, "learning_rate": 3.109924440049166e-05, "loss": 0.4176, "num_tokens": 1337640003.0, "step": 5110 }, { "epoch": 2.3846153846153846, "grad_norm": 0.35480427992895003, "learning_rate": 3.1064996135823736e-05, "loss": 0.4143, "num_tokens": 1338950723.0, "step": 5115 }, { "epoch": 2.386946386946387, "grad_norm": 0.3439245249078291, "learning_rate": 3.10307393965236e-05, "loss": 0.4128, "num_tokens": 1340261443.0, "step": 5120 }, { "epoch": 2.3892773892773893, "grad_norm": 0.38182867678172666, "learning_rate": 3.0996474264025654e-05, "loss": 0.4112, "num_tokens": 1341564973.0, "step": 5125 }, { "epoch": 2.3916083916083917, "grad_norm": 0.3598389668576128, "learning_rate": 3.096220081978423e-05, "loss": 0.4156, "num_tokens": 1342875693.0, "step": 5130 }, { "epoch": 2.393939393939394, "grad_norm": 0.3860442296593736, "learning_rate": 3.092791914527341e-05, "loss": 0.4091, "num_tokens": 1344186413.0, "step": 5135 }, { "epoch": 2.3962703962703964, "grad_norm": 0.36500293825420654, "learning_rate": 3.0893629321986874e-05, "loss": 0.4121, "num_tokens": 1345497133.0, "step": 5140 }, { "epoch": 2.3986013986013988, "grad_norm": 0.3776001399782403, "learning_rate": 3.085933143143765e-05, "loss": 0.4306, "num_tokens": 1346794525.0, "step": 5145 }, { "epoch": 2.400932400932401, "grad_norm": 0.3760233460610802, "learning_rate": 3.082502555515793e-05, "loss": 0.4302, "num_tokens": 1348105245.0, "step": 5150 }, { "epoch": 2.403263403263403, "grad_norm": 0.37109726618618827, "learning_rate": 3.079071177469892e-05, "loss": 0.4021, "num_tokens": 1349414836.0, "step": 5155 }, { "epoch": 2.4055944055944054, "grad_norm": 0.3562308097398838, "learning_rate": 3.07563901716306e-05, "loss": 0.4198, "num_tokens": 1350725556.0, "step": 5160 }, { "epoch": 2.4079254079254078, "grad_norm": 0.37126907481875954, "learning_rate": 3.072206082754154e-05, "loss": 0.427, "num_tokens": 1352022873.0, "step": 5165 }, { "epoch": 2.41025641025641, "grad_norm": 0.3766487217257149, "learning_rate": 3.068772382403873e-05, "loss": 0.4231, "num_tokens": 1353333593.0, "step": 5170 }, { "epoch": 2.4125874125874125, "grad_norm": 0.36252558911615673, "learning_rate": 3.065337924274735e-05, "loss": 0.4113, "num_tokens": 1354644313.0, "step": 5175 }, { "epoch": 2.414918414918415, "grad_norm": 0.36974990717403, "learning_rate": 3.06190271653106e-05, "loss": 0.4289, "num_tokens": 1355955033.0, "step": 5180 }, { "epoch": 2.417249417249417, "grad_norm": 0.3383571793373514, "learning_rate": 3.058466767338951e-05, "loss": 0.4214, "num_tokens": 1357265753.0, "step": 5185 }, { "epoch": 2.4195804195804196, "grad_norm": 0.38036895795490944, "learning_rate": 3.0550300848662704e-05, "loss": 0.4207, "num_tokens": 1358576473.0, "step": 5190 }, { "epoch": 2.421911421911422, "grad_norm": 0.3505203016572691, "learning_rate": 3.051592677282628e-05, "loss": 0.4165, "num_tokens": 1359887193.0, "step": 5195 }, { "epoch": 2.4242424242424243, "grad_norm": 0.3640785191337429, "learning_rate": 3.0481545527593546e-05, "loss": 0.4272, "num_tokens": 1361197913.0, "step": 5200 }, { "epoch": 2.4265734265734267, "grad_norm": 0.35398548228202553, "learning_rate": 3.0447157194694864e-05, "loss": 0.4182, "num_tokens": 1362508633.0, "step": 5205 }, { "epoch": 2.428904428904429, "grad_norm": 0.36683505543529804, "learning_rate": 3.041276185587743e-05, "loss": 0.421, "num_tokens": 1363819353.0, "step": 5210 }, { "epoch": 2.4312354312354314, "grad_norm": 0.37273261132442753, "learning_rate": 3.0378359592905097e-05, "loss": 0.4004, "num_tokens": 1365130073.0, "step": 5215 }, { "epoch": 2.4335664335664333, "grad_norm": 0.3711753185510304, "learning_rate": 3.0343950487558208e-05, "loss": 0.4157, "num_tokens": 1366440793.0, "step": 5220 }, { "epoch": 2.435897435897436, "grad_norm": 0.3886512119876518, "learning_rate": 3.030953462163334e-05, "loss": 0.4203, "num_tokens": 1367751513.0, "step": 5225 }, { "epoch": 2.438228438228438, "grad_norm": 0.35270305999051793, "learning_rate": 3.0275112076943145e-05, "loss": 0.4039, "num_tokens": 1369051755.0, "step": 5230 }, { "epoch": 2.4405594405594404, "grad_norm": 0.3912345566826907, "learning_rate": 3.0240682935316156e-05, "loss": 0.4152, "num_tokens": 1370362475.0, "step": 5235 }, { "epoch": 2.4428904428904428, "grad_norm": 0.39812504543925764, "learning_rate": 3.0206247278596594e-05, "loss": 0.4252, "num_tokens": 1371667349.0, "step": 5240 }, { "epoch": 2.445221445221445, "grad_norm": 0.36495714706101673, "learning_rate": 3.0171805188644163e-05, "loss": 0.4262, "num_tokens": 1372978069.0, "step": 5245 }, { "epoch": 2.4475524475524475, "grad_norm": 0.32681284740431915, "learning_rate": 3.013735674733385e-05, "loss": 0.4091, "num_tokens": 1374288789.0, "step": 5250 }, { "epoch": 2.44988344988345, "grad_norm": 0.41435527791431637, "learning_rate": 3.0102902036555765e-05, "loss": 0.4153, "num_tokens": 1375599509.0, "step": 5255 }, { "epoch": 2.4522144522144522, "grad_norm": 0.347283922287271, "learning_rate": 3.0068441138214886e-05, "loss": 0.4092, "num_tokens": 1376910229.0, "step": 5260 }, { "epoch": 2.4545454545454546, "grad_norm": 0.38829537811571885, "learning_rate": 3.0033974134230937e-05, "loss": 0.4177, "num_tokens": 1378220949.0, "step": 5265 }, { "epoch": 2.456876456876457, "grad_norm": 0.3275166232456935, "learning_rate": 2.9999501106538126e-05, "loss": 0.4082, "num_tokens": 1379531669.0, "step": 5270 }, { "epoch": 2.4592074592074593, "grad_norm": 0.42514354355776585, "learning_rate": 2.9965022137084997e-05, "loss": 0.4056, "num_tokens": 1380837473.0, "step": 5275 }, { "epoch": 2.4615384615384617, "grad_norm": 0.37968747313444934, "learning_rate": 2.993053730783422e-05, "loss": 0.3987, "num_tokens": 1382148193.0, "step": 5280 }, { "epoch": 2.463869463869464, "grad_norm": 0.3560839943508509, "learning_rate": 2.9896046700762398e-05, "loss": 0.4136, "num_tokens": 1383458913.0, "step": 5285 }, { "epoch": 2.4662004662004664, "grad_norm": 0.36938187951856855, "learning_rate": 2.9861550397859838e-05, "loss": 0.4154, "num_tokens": 1384769633.0, "step": 5290 }, { "epoch": 2.4685314685314683, "grad_norm": 0.40308698065693266, "learning_rate": 2.982704848113043e-05, "loss": 0.425, "num_tokens": 1386065812.0, "step": 5295 }, { "epoch": 2.4708624708624707, "grad_norm": 0.3293971532786334, "learning_rate": 2.9792541032591387e-05, "loss": 0.4114, "num_tokens": 1387376532.0, "step": 5300 }, { "epoch": 2.473193473193473, "grad_norm": 0.3273718604110626, "learning_rate": 2.975802813427307e-05, "loss": 0.3997, "num_tokens": 1388687252.0, "step": 5305 }, { "epoch": 2.4755244755244754, "grad_norm": 0.35222934802235306, "learning_rate": 2.9723509868218792e-05, "loss": 0.4143, "num_tokens": 1389997972.0, "step": 5310 }, { "epoch": 2.4778554778554778, "grad_norm": 0.3606682109903173, "learning_rate": 2.9688986316484636e-05, "loss": 0.4158, "num_tokens": 1391308692.0, "step": 5315 }, { "epoch": 2.48018648018648, "grad_norm": 0.3851368043166362, "learning_rate": 2.9654457561139254e-05, "loss": 0.4204, "num_tokens": 1392619412.0, "step": 5320 }, { "epoch": 2.4825174825174825, "grad_norm": 0.35215808790406317, "learning_rate": 2.961992368426366e-05, "loss": 0.4112, "num_tokens": 1393930132.0, "step": 5325 }, { "epoch": 2.484848484848485, "grad_norm": 0.33407840882303375, "learning_rate": 2.958538476795104e-05, "loss": 0.4065, "num_tokens": 1395232316.0, "step": 5330 }, { "epoch": 2.4871794871794872, "grad_norm": 0.365448648296931, "learning_rate": 2.9550840894306565e-05, "loss": 0.4144, "num_tokens": 1396543036.0, "step": 5335 }, { "epoch": 2.4895104895104896, "grad_norm": 0.3516522718004562, "learning_rate": 2.9516292145447187e-05, "loss": 0.4036, "num_tokens": 1397853756.0, "step": 5340 }, { "epoch": 2.491841491841492, "grad_norm": 0.3964385897616128, "learning_rate": 2.9481738603501464e-05, "loss": 0.4145, "num_tokens": 1399164476.0, "step": 5345 }, { "epoch": 2.4941724941724943, "grad_norm": 0.3330290617721678, "learning_rate": 2.9447180350609305e-05, "loss": 0.4126, "num_tokens": 1400475196.0, "step": 5350 }, { "epoch": 2.4965034965034967, "grad_norm": 0.3620399941365234, "learning_rate": 2.941261746892187e-05, "loss": 0.4198, "num_tokens": 1401785916.0, "step": 5355 }, { "epoch": 2.4988344988344986, "grad_norm": 0.4031249657041131, "learning_rate": 2.937805004060129e-05, "loss": 0.3909, "num_tokens": 1403096636.0, "step": 5360 }, { "epoch": 2.5011655011655014, "grad_norm": 0.35297787174955664, "learning_rate": 2.9343478147820515e-05, "loss": 0.4161, "num_tokens": 1404398368.0, "step": 5365 }, { "epoch": 2.5034965034965033, "grad_norm": 0.3580720337699733, "learning_rate": 2.9308901872763107e-05, "loss": 0.4119, "num_tokens": 1405709088.0, "step": 5370 }, { "epoch": 2.5058275058275057, "grad_norm": 0.33634655420177195, "learning_rate": 2.927432129762303e-05, "loss": 0.4072, "num_tokens": 1407019808.0, "step": 5375 }, { "epoch": 2.508158508158508, "grad_norm": 0.4092817541410367, "learning_rate": 2.923973650460451e-05, "loss": 0.4249, "num_tokens": 1408330528.0, "step": 5380 }, { "epoch": 2.5104895104895104, "grad_norm": 0.386629174107386, "learning_rate": 2.9205147575921748e-05, "loss": 0.4001, "num_tokens": 1409641248.0, "step": 5385 }, { "epoch": 2.5128205128205128, "grad_norm": 0.38645028662583764, "learning_rate": 2.917055459379881e-05, "loss": 0.4201, "num_tokens": 1410951968.0, "step": 5390 }, { "epoch": 2.515151515151515, "grad_norm": 0.3870366504220812, "learning_rate": 2.9135957640469407e-05, "loss": 0.4013, "num_tokens": 1412262688.0, "step": 5395 }, { "epoch": 2.5174825174825175, "grad_norm": 0.3590149507926292, "learning_rate": 2.9101356798176648e-05, "loss": 0.4281, "num_tokens": 1413573408.0, "step": 5400 }, { "epoch": 2.51981351981352, "grad_norm": 0.3914639948997253, "learning_rate": 2.9066752149172927e-05, "loss": 0.4105, "num_tokens": 1414884128.0, "step": 5405 }, { "epoch": 2.5221445221445222, "grad_norm": 0.3459336521291884, "learning_rate": 2.903214377571967e-05, "loss": 0.4018, "num_tokens": 1416194848.0, "step": 5410 }, { "epoch": 2.5244755244755246, "grad_norm": 0.351270481877377, "learning_rate": 2.8997531760087143e-05, "loss": 0.4256, "num_tokens": 1417505568.0, "step": 5415 }, { "epoch": 2.526806526806527, "grad_norm": 0.3494010823054712, "learning_rate": 2.896291618455431e-05, "loss": 0.412, "num_tokens": 1418816288.0, "step": 5420 }, { "epoch": 2.529137529137529, "grad_norm": 0.3663146601610986, "learning_rate": 2.8928297131408557e-05, "loss": 0.4104, "num_tokens": 1420127008.0, "step": 5425 }, { "epoch": 2.5314685314685317, "grad_norm": 0.39872390845448125, "learning_rate": 2.889367468294556e-05, "loss": 0.4068, "num_tokens": 1421437728.0, "step": 5430 }, { "epoch": 2.5337995337995336, "grad_norm": 0.37140846588281806, "learning_rate": 2.885904892146905e-05, "loss": 0.4189, "num_tokens": 1422748448.0, "step": 5435 }, { "epoch": 2.5361305361305364, "grad_norm": 0.3657604671694247, "learning_rate": 2.8824419929290665e-05, "loss": 0.4191, "num_tokens": 1424059168.0, "step": 5440 }, { "epoch": 2.5384615384615383, "grad_norm": 0.3568520716840305, "learning_rate": 2.878978778872968e-05, "loss": 0.4201, "num_tokens": 1425369888.0, "step": 5445 }, { "epoch": 2.5407925407925407, "grad_norm": 0.39448531507467444, "learning_rate": 2.8755152582112877e-05, "loss": 0.4161, "num_tokens": 1426680608.0, "step": 5450 }, { "epoch": 2.543123543123543, "grad_norm": 0.34606657031969595, "learning_rate": 2.8720514391774333e-05, "loss": 0.4134, "num_tokens": 1427991328.0, "step": 5455 }, { "epoch": 2.5454545454545454, "grad_norm": 0.4053792494656858, "learning_rate": 2.8685873300055206e-05, "loss": 0.402, "num_tokens": 1429302048.0, "step": 5460 }, { "epoch": 2.5477855477855478, "grad_norm": 0.3798273899991343, "learning_rate": 2.8651229389303556e-05, "loss": 0.4133, "num_tokens": 1430612768.0, "step": 5465 }, { "epoch": 2.55011655011655, "grad_norm": 0.35682635054873424, "learning_rate": 2.8616582741874143e-05, "loss": 0.4117, "num_tokens": 1431923488.0, "step": 5470 }, { "epoch": 2.5524475524475525, "grad_norm": 0.3645622618000647, "learning_rate": 2.8581933440128228e-05, "loss": 0.4239, "num_tokens": 1433234208.0, "step": 5475 }, { "epoch": 2.554778554778555, "grad_norm": 0.3703457691159881, "learning_rate": 2.8547281566433393e-05, "loss": 0.4128, "num_tokens": 1434539040.0, "step": 5480 }, { "epoch": 2.5571095571095572, "grad_norm": 0.41270632575142685, "learning_rate": 2.851262720316332e-05, "loss": 0.4095, "num_tokens": 1435849760.0, "step": 5485 }, { "epoch": 2.5594405594405596, "grad_norm": 0.33721292116678364, "learning_rate": 2.8477970432697625e-05, "loss": 0.3976, "num_tokens": 1437160480.0, "step": 5490 }, { "epoch": 2.561771561771562, "grad_norm": 0.3503161770381493, "learning_rate": 2.8443311337421642e-05, "loss": 0.4228, "num_tokens": 1438471200.0, "step": 5495 }, { "epoch": 2.564102564102564, "grad_norm": 0.39093092792942047, "learning_rate": 2.840864999972621e-05, "loss": 0.4102, "num_tokens": 1439768547.0, "step": 5500 }, { "epoch": 2.5664335664335667, "grad_norm": 0.3855562639577423, "learning_rate": 2.8373986502007522e-05, "loss": 0.3962, "num_tokens": 1441079267.0, "step": 5505 }, { "epoch": 2.5687645687645686, "grad_norm": 0.39374706330512194, "learning_rate": 2.833932092666692e-05, "loss": 0.4168, "num_tokens": 1442389987.0, "step": 5510 }, { "epoch": 2.571095571095571, "grad_norm": 0.35307123842027394, "learning_rate": 2.830465335611064e-05, "loss": 0.4109, "num_tokens": 1443700707.0, "step": 5515 }, { "epoch": 2.5734265734265733, "grad_norm": 0.3531292824198428, "learning_rate": 2.826998387274969e-05, "loss": 0.401, "num_tokens": 1445011427.0, "step": 5520 }, { "epoch": 2.5757575757575757, "grad_norm": 0.35841271371937383, "learning_rate": 2.8235312558999634e-05, "loss": 0.3987, "num_tokens": 1446322147.0, "step": 5525 }, { "epoch": 2.578088578088578, "grad_norm": 0.36420778646479857, "learning_rate": 2.820063949728035e-05, "loss": 0.4004, "num_tokens": 1447632867.0, "step": 5530 }, { "epoch": 2.5804195804195804, "grad_norm": 0.34605617089524987, "learning_rate": 2.8165964770015923e-05, "loss": 0.4046, "num_tokens": 1448943587.0, "step": 5535 }, { "epoch": 2.582750582750583, "grad_norm": 0.3638468608455636, "learning_rate": 2.8131288459634358e-05, "loss": 0.4183, "num_tokens": 1450254307.0, "step": 5540 }, { "epoch": 2.585081585081585, "grad_norm": 0.33673390656126306, "learning_rate": 2.8096610648567428e-05, "loss": 0.4052, "num_tokens": 1451549273.0, "step": 5545 }, { "epoch": 2.5874125874125875, "grad_norm": 0.3450093834999504, "learning_rate": 2.806193141925048e-05, "loss": 0.4092, "num_tokens": 1452852946.0, "step": 5550 }, { "epoch": 2.58974358974359, "grad_norm": 0.33375024409509796, "learning_rate": 2.8027250854122245e-05, "loss": 0.4071, "num_tokens": 1454163666.0, "step": 5555 }, { "epoch": 2.5920745920745922, "grad_norm": 0.32434266094123126, "learning_rate": 2.7992569035624612e-05, "loss": 0.4088, "num_tokens": 1455464696.0, "step": 5560 }, { "epoch": 2.594405594405594, "grad_norm": 0.3625165606072977, "learning_rate": 2.795788604620246e-05, "loss": 0.4027, "num_tokens": 1456775416.0, "step": 5565 }, { "epoch": 2.596736596736597, "grad_norm": 0.34850576006189254, "learning_rate": 2.7923201968303427e-05, "loss": 0.4225, "num_tokens": 1458086136.0, "step": 5570 }, { "epoch": 2.599067599067599, "grad_norm": 0.31593994005170517, "learning_rate": 2.788851688437777e-05, "loss": 0.4014, "num_tokens": 1459396856.0, "step": 5575 }, { "epoch": 2.6013986013986012, "grad_norm": 0.34688524294847034, "learning_rate": 2.785383087687813e-05, "loss": 0.4172, "num_tokens": 1460707576.0, "step": 5580 }, { "epoch": 2.6037296037296036, "grad_norm": 0.3557511247705549, "learning_rate": 2.781914402825933e-05, "loss": 0.4143, "num_tokens": 1462018296.0, "step": 5585 }, { "epoch": 2.606060606060606, "grad_norm": 0.3387419936179967, "learning_rate": 2.77844564209782e-05, "loss": 0.4029, "num_tokens": 1463329016.0, "step": 5590 }, { "epoch": 2.6083916083916083, "grad_norm": 0.343630335231053, "learning_rate": 2.77497681374934e-05, "loss": 0.404, "num_tokens": 1464639736.0, "step": 5595 }, { "epoch": 2.6107226107226107, "grad_norm": 0.3332182207020839, "learning_rate": 2.7715079260265124e-05, "loss": 0.4006, "num_tokens": 1465950456.0, "step": 5600 }, { "epoch": 2.613053613053613, "grad_norm": 0.3643825631091346, "learning_rate": 2.7680389871755064e-05, "loss": 0.4097, "num_tokens": 1467261176.0, "step": 5605 }, { "epoch": 2.6153846153846154, "grad_norm": 0.3364764273544578, "learning_rate": 2.7645700054426087e-05, "loss": 0.4033, "num_tokens": 1468571896.0, "step": 5610 }, { "epoch": 2.617715617715618, "grad_norm": 0.34072785097660746, "learning_rate": 2.7611009890742058e-05, "loss": 0.4212, "num_tokens": 1469882616.0, "step": 5615 }, { "epoch": 2.62004662004662, "grad_norm": 0.3574149052487222, "learning_rate": 2.757631946316771e-05, "loss": 0.4154, "num_tokens": 1471193336.0, "step": 5620 }, { "epoch": 2.6223776223776225, "grad_norm": 0.32899936425520643, "learning_rate": 2.754162885416837e-05, "loss": 0.4135, "num_tokens": 1472504056.0, "step": 5625 }, { "epoch": 2.624708624708625, "grad_norm": 0.3263664126860434, "learning_rate": 2.7506938146209816e-05, "loss": 0.4048, "num_tokens": 1473814776.0, "step": 5630 }, { "epoch": 2.6270396270396272, "grad_norm": 0.35274896684534274, "learning_rate": 2.7472247421758046e-05, "loss": 0.3946, "num_tokens": 1475125496.0, "step": 5635 }, { "epoch": 2.629370629370629, "grad_norm": 0.3706959177174659, "learning_rate": 2.743755676327911e-05, "loss": 0.4116, "num_tokens": 1476436216.0, "step": 5640 }, { "epoch": 2.631701631701632, "grad_norm": 0.3391136391587131, "learning_rate": 2.7402866253238896e-05, "loss": 0.4028, "num_tokens": 1477746936.0, "step": 5645 }, { "epoch": 2.634032634032634, "grad_norm": 0.3880994944779498, "learning_rate": 2.7368175974102938e-05, "loss": 0.3995, "num_tokens": 1479057656.0, "step": 5650 }, { "epoch": 2.6363636363636362, "grad_norm": 0.3565909411799108, "learning_rate": 2.7333486008336217e-05, "loss": 0.4089, "num_tokens": 1480368376.0, "step": 5655 }, { "epoch": 2.6386946386946386, "grad_norm": 0.3229561588592343, "learning_rate": 2.7298796438402986e-05, "loss": 0.4108, "num_tokens": 1481679096.0, "step": 5660 }, { "epoch": 2.641025641025641, "grad_norm": 0.3393467174551697, "learning_rate": 2.726410734676653e-05, "loss": 0.4153, "num_tokens": 1482989816.0, "step": 5665 }, { "epoch": 2.6433566433566433, "grad_norm": 0.40533026810505063, "learning_rate": 2.7229418815889023e-05, "loss": 0.427, "num_tokens": 1484300536.0, "step": 5670 }, { "epoch": 2.6456876456876457, "grad_norm": 0.3493290537833516, "learning_rate": 2.7194730928231292e-05, "loss": 0.4233, "num_tokens": 1485603324.0, "step": 5675 }, { "epoch": 2.648018648018648, "grad_norm": 0.3293219859362695, "learning_rate": 2.716004376625264e-05, "loss": 0.4137, "num_tokens": 1486914044.0, "step": 5680 }, { "epoch": 2.6503496503496504, "grad_norm": 0.32917350717509924, "learning_rate": 2.7125357412410634e-05, "loss": 0.4112, "num_tokens": 1488224764.0, "step": 5685 }, { "epoch": 2.652680652680653, "grad_norm": 0.36778219939737017, "learning_rate": 2.7090671949160945e-05, "loss": 0.4151, "num_tokens": 1489530334.0, "step": 5690 }, { "epoch": 2.655011655011655, "grad_norm": 0.3520157165475074, "learning_rate": 2.70559874589571e-05, "loss": 0.4252, "num_tokens": 1490841054.0, "step": 5695 }, { "epoch": 2.6573426573426575, "grad_norm": 0.364402639873204, "learning_rate": 2.7021304024250315e-05, "loss": 0.415, "num_tokens": 1492151774.0, "step": 5700 }, { "epoch": 2.6596736596736594, "grad_norm": 0.3802755602559821, "learning_rate": 2.698662172748933e-05, "loss": 0.4084, "num_tokens": 1493457481.0, "step": 5705 }, { "epoch": 2.6620046620046622, "grad_norm": 0.3296000790473323, "learning_rate": 2.695194065112014e-05, "loss": 0.41, "num_tokens": 1494759113.0, "step": 5710 }, { "epoch": 2.664335664335664, "grad_norm": 0.3427344361950261, "learning_rate": 2.6917260877585854e-05, "loss": 0.4155, "num_tokens": 1496069833.0, "step": 5715 }, { "epoch": 2.6666666666666665, "grad_norm": 0.33635269462764017, "learning_rate": 2.6882582489326485e-05, "loss": 0.4175, "num_tokens": 1497380553.0, "step": 5720 }, { "epoch": 2.668997668997669, "grad_norm": 0.3931298913652689, "learning_rate": 2.6847905568778753e-05, "loss": 0.421, "num_tokens": 1498670997.0, "step": 5725 }, { "epoch": 2.6713286713286712, "grad_norm": 0.35218576187719325, "learning_rate": 2.6813230198375887e-05, "loss": 0.4072, "num_tokens": 1499981717.0, "step": 5730 }, { "epoch": 2.6736596736596736, "grad_norm": 0.35875764676123595, "learning_rate": 2.6778556460547437e-05, "loss": 0.4185, "num_tokens": 1501292437.0, "step": 5735 }, { "epoch": 2.675990675990676, "grad_norm": 0.36986425256659194, "learning_rate": 2.6743884437719064e-05, "loss": 0.4052, "num_tokens": 1502603157.0, "step": 5740 }, { "epoch": 2.6783216783216783, "grad_norm": 0.32481671854646, "learning_rate": 2.6709214212312362e-05, "loss": 0.4175, "num_tokens": 1503913877.0, "step": 5745 }, { "epoch": 2.6806526806526807, "grad_norm": 0.33341909508902867, "learning_rate": 2.6674545866744627e-05, "loss": 0.4095, "num_tokens": 1505212841.0, "step": 5750 }, { "epoch": 2.682983682983683, "grad_norm": 0.3247256800202665, "learning_rate": 2.663987948342873e-05, "loss": 0.3978, "num_tokens": 1506523561.0, "step": 5755 }, { "epoch": 2.6853146853146854, "grad_norm": 0.3693723379792467, "learning_rate": 2.6605215144772844e-05, "loss": 0.4031, "num_tokens": 1507834281.0, "step": 5760 }, { "epoch": 2.687645687645688, "grad_norm": 0.3363247073452088, "learning_rate": 2.6570552933180275e-05, "loss": 0.4096, "num_tokens": 1509145001.0, "step": 5765 }, { "epoch": 2.6899766899766897, "grad_norm": 0.3686548048692657, "learning_rate": 2.6535892931049304e-05, "loss": 0.3989, "num_tokens": 1510455721.0, "step": 5770 }, { "epoch": 2.6923076923076925, "grad_norm": 0.3590538341136808, "learning_rate": 2.650123522077294e-05, "loss": 0.4126, "num_tokens": 1511756221.0, "step": 5775 }, { "epoch": 2.6946386946386944, "grad_norm": 0.3939246015397987, "learning_rate": 2.6466579884738745e-05, "loss": 0.4127, "num_tokens": 1513066941.0, "step": 5780 }, { "epoch": 2.6969696969696972, "grad_norm": 0.36656521638788236, "learning_rate": 2.6431927005328634e-05, "loss": 0.4118, "num_tokens": 1514377661.0, "step": 5785 }, { "epoch": 2.699300699300699, "grad_norm": 0.3786768997732052, "learning_rate": 2.6397276664918695e-05, "loss": 0.4056, "num_tokens": 1515688381.0, "step": 5790 }, { "epoch": 2.7016317016317015, "grad_norm": 0.34250789196379766, "learning_rate": 2.6362628945878982e-05, "loss": 0.4237, "num_tokens": 1516983122.0, "step": 5795 }, { "epoch": 2.703962703962704, "grad_norm": 0.3879698601925601, "learning_rate": 2.6327983930573275e-05, "loss": 0.4194, "num_tokens": 1518293842.0, "step": 5800 }, { "epoch": 2.7062937062937062, "grad_norm": 0.34322275404532027, "learning_rate": 2.629334170135899e-05, "loss": 0.421, "num_tokens": 1519604562.0, "step": 5805 }, { "epoch": 2.7086247086247086, "grad_norm": 0.3274763994589238, "learning_rate": 2.6258702340586888e-05, "loss": 0.3991, "num_tokens": 1520915282.0, "step": 5810 }, { "epoch": 2.710955710955711, "grad_norm": 0.3313466614418822, "learning_rate": 2.6224065930600895e-05, "loss": 0.4114, "num_tokens": 1522226002.0, "step": 5815 }, { "epoch": 2.7132867132867133, "grad_norm": 0.3577956678121881, "learning_rate": 2.6189432553737965e-05, "loss": 0.4313, "num_tokens": 1523536722.0, "step": 5820 }, { "epoch": 2.7156177156177157, "grad_norm": 0.34556719467314156, "learning_rate": 2.6154802292327795e-05, "loss": 0.4179, "num_tokens": 1524847442.0, "step": 5825 }, { "epoch": 2.717948717948718, "grad_norm": 0.3422225730154786, "learning_rate": 2.6120175228692705e-05, "loss": 0.4224, "num_tokens": 1526145343.0, "step": 5830 }, { "epoch": 2.7202797202797204, "grad_norm": 0.3310291884231872, "learning_rate": 2.608555144514741e-05, "loss": 0.4104, "num_tokens": 1527456063.0, "step": 5835 }, { "epoch": 2.722610722610723, "grad_norm": 0.35527150456435, "learning_rate": 2.6050931023998825e-05, "loss": 0.4265, "num_tokens": 1528762674.0, "step": 5840 }, { "epoch": 2.7249417249417247, "grad_norm": 0.3810983867864767, "learning_rate": 2.601631404754587e-05, "loss": 0.4156, "num_tokens": 1530058844.0, "step": 5845 }, { "epoch": 2.7272727272727275, "grad_norm": 0.3920073125528213, "learning_rate": 2.5981700598079267e-05, "loss": 0.4202, "num_tokens": 1531358947.0, "step": 5850 }, { "epoch": 2.7296037296037294, "grad_norm": 0.3259366792942078, "learning_rate": 2.594709075788138e-05, "loss": 0.414, "num_tokens": 1532660121.0, "step": 5855 }, { "epoch": 2.731934731934732, "grad_norm": 0.37607877851481675, "learning_rate": 2.5912484609225973e-05, "loss": 0.4125, "num_tokens": 1533961019.0, "step": 5860 }, { "epoch": 2.734265734265734, "grad_norm": 0.361401132584504, "learning_rate": 2.5877882234378027e-05, "loss": 0.4149, "num_tokens": 1535271739.0, "step": 5865 }, { "epoch": 2.7365967365967365, "grad_norm": 0.3814825573434443, "learning_rate": 2.584328371559358e-05, "loss": 0.4171, "num_tokens": 1536582459.0, "step": 5870 }, { "epoch": 2.738927738927739, "grad_norm": 0.33196554110199666, "learning_rate": 2.5808689135119484e-05, "loss": 0.4198, "num_tokens": 1537893179.0, "step": 5875 }, { "epoch": 2.7412587412587412, "grad_norm": 0.3514505419081785, "learning_rate": 2.577409857519323e-05, "loss": 0.4116, "num_tokens": 1539203899.0, "step": 5880 }, { "epoch": 2.7435897435897436, "grad_norm": 0.36051816664386027, "learning_rate": 2.573951211804274e-05, "loss": 0.3955, "num_tokens": 1540501469.0, "step": 5885 }, { "epoch": 2.745920745920746, "grad_norm": 0.34431578542179486, "learning_rate": 2.570492984588622e-05, "loss": 0.4048, "num_tokens": 1541807849.0, "step": 5890 }, { "epoch": 2.7482517482517483, "grad_norm": 0.3482952718749119, "learning_rate": 2.56703518409319e-05, "loss": 0.421, "num_tokens": 1543118569.0, "step": 5895 }, { "epoch": 2.7505827505827507, "grad_norm": 0.3435903033996109, "learning_rate": 2.5635778185377846e-05, "loss": 0.4105, "num_tokens": 1544429289.0, "step": 5900 }, { "epoch": 2.752913752913753, "grad_norm": 0.3649183613089683, "learning_rate": 2.5601208961411838e-05, "loss": 0.4363, "num_tokens": 1545740009.0, "step": 5905 }, { "epoch": 2.755244755244755, "grad_norm": 0.33542477596319575, "learning_rate": 2.556664425121108e-05, "loss": 0.417, "num_tokens": 1547050729.0, "step": 5910 }, { "epoch": 2.757575757575758, "grad_norm": 0.3612431221112293, "learning_rate": 2.5532084136942048e-05, "loss": 0.4106, "num_tokens": 1548361449.0, "step": 5915 }, { "epoch": 2.7599067599067597, "grad_norm": 0.34457273412289285, "learning_rate": 2.5497528700760333e-05, "loss": 0.4111, "num_tokens": 1549672169.0, "step": 5920 }, { "epoch": 2.762237762237762, "grad_norm": 0.3643149061193825, "learning_rate": 2.5462978024810347e-05, "loss": 0.4007, "num_tokens": 1550982889.0, "step": 5925 }, { "epoch": 2.7645687645687644, "grad_norm": 0.33113278908341853, "learning_rate": 2.5428432191225226e-05, "loss": 0.4115, "num_tokens": 1552293609.0, "step": 5930 }, { "epoch": 2.766899766899767, "grad_norm": 0.324549859393571, "learning_rate": 2.5393891282126576e-05, "loss": 0.4147, "num_tokens": 1553603846.0, "step": 5935 }, { "epoch": 2.769230769230769, "grad_norm": 0.34583557264006637, "learning_rate": 2.5359355379624317e-05, "loss": 0.4159, "num_tokens": 1554898212.0, "step": 5940 }, { "epoch": 2.7715617715617715, "grad_norm": 0.3704574170132862, "learning_rate": 2.532482456581644e-05, "loss": 0.4187, "num_tokens": 1556204437.0, "step": 5945 }, { "epoch": 2.773892773892774, "grad_norm": 0.3427794333227205, "learning_rate": 2.529029892278886e-05, "loss": 0.4052, "num_tokens": 1557515157.0, "step": 5950 }, { "epoch": 2.7762237762237763, "grad_norm": 0.3533422903029431, "learning_rate": 2.5255778532615194e-05, "loss": 0.4092, "num_tokens": 1558825877.0, "step": 5955 }, { "epoch": 2.7785547785547786, "grad_norm": 0.3549142246233924, "learning_rate": 2.5221263477356572e-05, "loss": 0.4081, "num_tokens": 1560136597.0, "step": 5960 }, { "epoch": 2.780885780885781, "grad_norm": 0.38274212191913465, "learning_rate": 2.5186753839061438e-05, "loss": 0.4038, "num_tokens": 1561447317.0, "step": 5965 }, { "epoch": 2.7832167832167833, "grad_norm": 0.33786865946250155, "learning_rate": 2.5152249699765367e-05, "loss": 0.4018, "num_tokens": 1562758037.0, "step": 5970 }, { "epoch": 2.7855477855477857, "grad_norm": 0.37594129759116907, "learning_rate": 2.5117751141490858e-05, "loss": 0.4275, "num_tokens": 1564068757.0, "step": 5975 }, { "epoch": 2.787878787878788, "grad_norm": 0.3547780778843013, "learning_rate": 2.5083258246247144e-05, "loss": 0.4107, "num_tokens": 1565366132.0, "step": 5980 }, { "epoch": 2.79020979020979, "grad_norm": 0.336676677250395, "learning_rate": 2.5048771096029976e-05, "loss": 0.4228, "num_tokens": 1566676852.0, "step": 5985 }, { "epoch": 2.792540792540793, "grad_norm": 0.37105836604705056, "learning_rate": 2.5014289772821486e-05, "loss": 0.4141, "num_tokens": 1567987572.0, "step": 5990 }, { "epoch": 2.7948717948717947, "grad_norm": 0.3641465643333037, "learning_rate": 2.4979814358589944e-05, "loss": 0.4268, "num_tokens": 1569298292.0, "step": 5995 }, { "epoch": 2.797202797202797, "grad_norm": 0.39906084842919354, "learning_rate": 2.494534493528952e-05, "loss": 0.4249, "num_tokens": 1570609012.0, "step": 6000 }, { "epoch": 2.7995337995337994, "grad_norm": 0.33358892565041576, "learning_rate": 2.491088158486024e-05, "loss": 0.3972, "num_tokens": 1571908251.0, "step": 6005 }, { "epoch": 2.801864801864802, "grad_norm": 0.3597994521954223, "learning_rate": 2.487642438922761e-05, "loss": 0.3987, "num_tokens": 1573218971.0, "step": 6010 }, { "epoch": 2.804195804195804, "grad_norm": 0.32444840115208157, "learning_rate": 2.484197343030253e-05, "loss": 0.4103, "num_tokens": 1574529691.0, "step": 6015 }, { "epoch": 2.8065268065268065, "grad_norm": 0.3369493662408065, "learning_rate": 2.48075287899811e-05, "loss": 0.4077, "num_tokens": 1575834151.0, "step": 6020 }, { "epoch": 2.808857808857809, "grad_norm": 0.35772585365072584, "learning_rate": 2.4773090550144366e-05, "loss": 0.4176, "num_tokens": 1577144871.0, "step": 6025 }, { "epoch": 2.8111888111888113, "grad_norm": 0.36110473631062134, "learning_rate": 2.473865879265817e-05, "loss": 0.4253, "num_tokens": 1578449314.0, "step": 6030 }, { "epoch": 2.8135198135198136, "grad_norm": 0.33350627178900166, "learning_rate": 2.470423359937295e-05, "loss": 0.4142, "num_tokens": 1579760034.0, "step": 6035 }, { "epoch": 2.815850815850816, "grad_norm": 0.3597334739451086, "learning_rate": 2.4669815052123534e-05, "loss": 0.4125, "num_tokens": 1581060841.0, "step": 6040 }, { "epoch": 2.8181818181818183, "grad_norm": 0.39863743568305726, "learning_rate": 2.463540323272896e-05, "loss": 0.4161, "num_tokens": 1582371561.0, "step": 6045 }, { "epoch": 2.8205128205128203, "grad_norm": 0.3596208010960054, "learning_rate": 2.4600998222992257e-05, "loss": 0.4126, "num_tokens": 1583682281.0, "step": 6050 }, { "epoch": 2.822843822843823, "grad_norm": 0.3542098381141003, "learning_rate": 2.456660010470028e-05, "loss": 0.4164, "num_tokens": 1584993001.0, "step": 6055 }, { "epoch": 2.825174825174825, "grad_norm": 0.33689485990156454, "learning_rate": 2.4532208959623488e-05, "loss": 0.3965, "num_tokens": 1586303721.0, "step": 6060 }, { "epoch": 2.8275058275058274, "grad_norm": 0.36681615470748274, "learning_rate": 2.4497824869515773e-05, "loss": 0.4268, "num_tokens": 1587614441.0, "step": 6065 }, { "epoch": 2.8298368298368297, "grad_norm": 0.33853074905875574, "learning_rate": 2.4463447916114273e-05, "loss": 0.4105, "num_tokens": 1588909813.0, "step": 6070 }, { "epoch": 2.832167832167832, "grad_norm": 0.32550466329208894, "learning_rate": 2.4429078181139127e-05, "loss": 0.4083, "num_tokens": 1590220533.0, "step": 6075 }, { "epoch": 2.8344988344988344, "grad_norm": 0.3234109896716426, "learning_rate": 2.439471574629333e-05, "loss": 0.4162, "num_tokens": 1591531253.0, "step": 6080 }, { "epoch": 2.836829836829837, "grad_norm": 0.32857277817721126, "learning_rate": 2.4360360693262524e-05, "loss": 0.4077, "num_tokens": 1592841973.0, "step": 6085 }, { "epoch": 2.839160839160839, "grad_norm": 0.37570041632784523, "learning_rate": 2.4326013103714813e-05, "loss": 0.4081, "num_tokens": 1594152693.0, "step": 6090 }, { "epoch": 2.8414918414918415, "grad_norm": 0.36498566644592234, "learning_rate": 2.4291673059300546e-05, "loss": 0.4101, "num_tokens": 1595463413.0, "step": 6095 }, { "epoch": 2.843822843822844, "grad_norm": 0.33322954386185466, "learning_rate": 2.4257340641652115e-05, "loss": 0.4203, "num_tokens": 1596774133.0, "step": 6100 }, { "epoch": 2.8461538461538463, "grad_norm": 0.32749325978795546, "learning_rate": 2.4223015932383842e-05, "loss": 0.41, "num_tokens": 1598084853.0, "step": 6105 }, { "epoch": 2.8484848484848486, "grad_norm": 0.37491209566738504, "learning_rate": 2.4188699013091665e-05, "loss": 0.4288, "num_tokens": 1599395573.0, "step": 6110 }, { "epoch": 2.8508158508158505, "grad_norm": 0.34071317068951223, "learning_rate": 2.4154389965353025e-05, "loss": 0.4114, "num_tokens": 1600702382.0, "step": 6115 }, { "epoch": 2.8531468531468533, "grad_norm": 0.3564070027232447, "learning_rate": 2.4120088870726675e-05, "loss": 0.4269, "num_tokens": 1602013102.0, "step": 6120 }, { "epoch": 2.8554778554778553, "grad_norm": 0.37329758310001454, "learning_rate": 2.408579581075242e-05, "loss": 0.4311, "num_tokens": 1603321783.0, "step": 6125 }, { "epoch": 2.857808857808858, "grad_norm": 0.3300871433307017, "learning_rate": 2.4051510866950987e-05, "loss": 0.431, "num_tokens": 1604632503.0, "step": 6130 }, { "epoch": 2.86013986013986, "grad_norm": 0.33854849401601644, "learning_rate": 2.4017234120823816e-05, "loss": 0.4085, "num_tokens": 1605943223.0, "step": 6135 }, { "epoch": 2.8624708624708624, "grad_norm": 0.3397431033223637, "learning_rate": 2.3982965653852845e-05, "loss": 0.427, "num_tokens": 1607238436.0, "step": 6140 }, { "epoch": 2.8648018648018647, "grad_norm": 0.31163913161758244, "learning_rate": 2.3948705547500346e-05, "loss": 0.396, "num_tokens": 1608549156.0, "step": 6145 }, { "epoch": 2.867132867132867, "grad_norm": 0.36119797833011485, "learning_rate": 2.391445388320869e-05, "loss": 0.4219, "num_tokens": 1609859876.0, "step": 6150 }, { "epoch": 2.8694638694638694, "grad_norm": 0.35324575640930755, "learning_rate": 2.388021074240021e-05, "loss": 0.4045, "num_tokens": 1611170596.0, "step": 6155 }, { "epoch": 2.871794871794872, "grad_norm": 0.34168122235530196, "learning_rate": 2.3845976206476962e-05, "loss": 0.4119, "num_tokens": 1612481316.0, "step": 6160 }, { "epoch": 2.874125874125874, "grad_norm": 0.3267756311441551, "learning_rate": 2.381175035682055e-05, "loss": 0.4068, "num_tokens": 1613780656.0, "step": 6165 }, { "epoch": 2.8764568764568765, "grad_norm": 0.33042098367758244, "learning_rate": 2.377753327479193e-05, "loss": 0.4063, "num_tokens": 1615091376.0, "step": 6170 }, { "epoch": 2.878787878787879, "grad_norm": 0.3484728853535675, "learning_rate": 2.374332504173121e-05, "loss": 0.4062, "num_tokens": 1616402096.0, "step": 6175 }, { "epoch": 2.8811188811188813, "grad_norm": 0.3454455029684927, "learning_rate": 2.3709125738957467e-05, "loss": 0.4047, "num_tokens": 1617712816.0, "step": 6180 }, { "epoch": 2.8834498834498836, "grad_norm": 0.35256247777872834, "learning_rate": 2.3674935447768547e-05, "loss": 0.4092, "num_tokens": 1619023536.0, "step": 6185 }, { "epoch": 2.8857808857808855, "grad_norm": 0.34447264412116185, "learning_rate": 2.3640754249440893e-05, "loss": 0.4171, "num_tokens": 1620320855.0, "step": 6190 }, { "epoch": 2.8881118881118883, "grad_norm": 0.3300552790749895, "learning_rate": 2.360658222522929e-05, "loss": 0.404, "num_tokens": 1621631575.0, "step": 6195 }, { "epoch": 2.8904428904428903, "grad_norm": 0.3301478105077118, "learning_rate": 2.357241945636674e-05, "loss": 0.4041, "num_tokens": 1622942295.0, "step": 6200 }, { "epoch": 2.8927738927738926, "grad_norm": 0.3241286505711926, "learning_rate": 2.3538266024064272e-05, "loss": 0.4088, "num_tokens": 1624236677.0, "step": 6205 }, { "epoch": 2.895104895104895, "grad_norm": 0.3393889730654891, "learning_rate": 2.350412200951066e-05, "loss": 0.4045, "num_tokens": 1625547397.0, "step": 6210 }, { "epoch": 2.8974358974358974, "grad_norm": 0.3722971080352807, "learning_rate": 2.346998749387233e-05, "loss": 0.3972, "num_tokens": 1626858117.0, "step": 6215 }, { "epoch": 2.8997668997668997, "grad_norm": 0.3481952783229655, "learning_rate": 2.3435862558293137e-05, "loss": 0.4185, "num_tokens": 1628168837.0, "step": 6220 }, { "epoch": 2.902097902097902, "grad_norm": 0.3575311095008992, "learning_rate": 2.3401747283894122e-05, "loss": 0.4089, "num_tokens": 1629479557.0, "step": 6225 }, { "epoch": 2.9044289044289044, "grad_norm": 0.3213929542319523, "learning_rate": 2.3367641751773388e-05, "loss": 0.4044, "num_tokens": 1630783572.0, "step": 6230 }, { "epoch": 2.906759906759907, "grad_norm": 0.33775314492186553, "learning_rate": 2.3333546043005877e-05, "loss": 0.4117, "num_tokens": 1632094292.0, "step": 6235 }, { "epoch": 2.909090909090909, "grad_norm": 0.35583582490206767, "learning_rate": 2.3299460238643178e-05, "loss": 0.4191, "num_tokens": 1633405012.0, "step": 6240 }, { "epoch": 2.9114219114219115, "grad_norm": 0.3403055493539477, "learning_rate": 2.3265384419713325e-05, "loss": 0.4074, "num_tokens": 1634715732.0, "step": 6245 }, { "epoch": 2.913752913752914, "grad_norm": 0.32361564388685027, "learning_rate": 2.3231318667220624e-05, "loss": 0.4047, "num_tokens": 1636021018.0, "step": 6250 }, { "epoch": 2.916083916083916, "grad_norm": 0.36911990299906383, "learning_rate": 2.3197263062145457e-05, "loss": 0.3952, "num_tokens": 1637331738.0, "step": 6255 }, { "epoch": 2.9184149184149186, "grad_norm": 0.3529888818933417, "learning_rate": 2.3163217685444067e-05, "loss": 0.4037, "num_tokens": 1638640505.0, "step": 6260 }, { "epoch": 2.9207459207459205, "grad_norm": 0.3228114643758619, "learning_rate": 2.312918261804839e-05, "loss": 0.4039, "num_tokens": 1639951225.0, "step": 6265 }, { "epoch": 2.9230769230769234, "grad_norm": 0.37676066453773155, "learning_rate": 2.3095157940865876e-05, "loss": 0.408, "num_tokens": 1641261945.0, "step": 6270 }, { "epoch": 2.9254079254079253, "grad_norm": 0.3498311618012844, "learning_rate": 2.3061143734779235e-05, "loss": 0.4052, "num_tokens": 1642572665.0, "step": 6275 }, { "epoch": 2.9277389277389276, "grad_norm": 0.3793591470148143, "learning_rate": 2.3027140080646313e-05, "loss": 0.4059, "num_tokens": 1643883385.0, "step": 6280 }, { "epoch": 2.93006993006993, "grad_norm": 0.30604459853495847, "learning_rate": 2.299314705929987e-05, "loss": 0.4008, "num_tokens": 1645194105.0, "step": 6285 }, { "epoch": 2.9324009324009324, "grad_norm": 0.3471040950462663, "learning_rate": 2.295916475154739e-05, "loss": 0.4152, "num_tokens": 1646501490.0, "step": 6290 }, { "epoch": 2.9347319347319347, "grad_norm": 0.3497693139520713, "learning_rate": 2.292519323817087e-05, "loss": 0.4247, "num_tokens": 1647803839.0, "step": 6295 }, { "epoch": 2.937062937062937, "grad_norm": 0.37949081912769744, "learning_rate": 2.2891232599926666e-05, "loss": 0.4094, "num_tokens": 1649114559.0, "step": 6300 }, { "epoch": 2.9393939393939394, "grad_norm": 0.3885455375827252, "learning_rate": 2.2857282917545285e-05, "loss": 0.4188, "num_tokens": 1650425279.0, "step": 6305 }, { "epoch": 2.941724941724942, "grad_norm": 0.3477949975861951, "learning_rate": 2.2823344271731184e-05, "loss": 0.4176, "num_tokens": 1651731670.0, "step": 6310 }, { "epoch": 2.944055944055944, "grad_norm": 0.4148827072433097, "learning_rate": 2.2789416743162567e-05, "loss": 0.4097, "num_tokens": 1653042390.0, "step": 6315 }, { "epoch": 2.9463869463869465, "grad_norm": 0.34732540637824916, "learning_rate": 2.275550041249124e-05, "loss": 0.4112, "num_tokens": 1654353110.0, "step": 6320 }, { "epoch": 2.948717948717949, "grad_norm": 0.32571968477635, "learning_rate": 2.272159536034238e-05, "loss": 0.4127, "num_tokens": 1655663830.0, "step": 6325 }, { "epoch": 2.951048951048951, "grad_norm": 0.34646771087112505, "learning_rate": 2.2687701667314327e-05, "loss": 0.4042, "num_tokens": 1656974550.0, "step": 6330 }, { "epoch": 2.9533799533799536, "grad_norm": 0.3649182836596108, "learning_rate": 2.2653819413978454e-05, "loss": 0.3955, "num_tokens": 1658285270.0, "step": 6335 }, { "epoch": 2.9557109557109555, "grad_norm": 0.36915878506354805, "learning_rate": 2.261994868087893e-05, "loss": 0.4217, "num_tokens": 1659595990.0, "step": 6340 }, { "epoch": 2.958041958041958, "grad_norm": 0.34333413093504295, "learning_rate": 2.258608954853252e-05, "loss": 0.4019, "num_tokens": 1660899543.0, "step": 6345 }, { "epoch": 2.9603729603729603, "grad_norm": 0.3548268496609685, "learning_rate": 2.2552242097428432e-05, "loss": 0.4111, "num_tokens": 1662198615.0, "step": 6350 }, { "epoch": 2.9627039627039626, "grad_norm": 0.33885473039293307, "learning_rate": 2.2518406408028108e-05, "loss": 0.4136, "num_tokens": 1663509335.0, "step": 6355 }, { "epoch": 2.965034965034965, "grad_norm": 0.31938033848179553, "learning_rate": 2.2484582560765012e-05, "loss": 0.4059, "num_tokens": 1664820055.0, "step": 6360 }, { "epoch": 2.9673659673659674, "grad_norm": 0.33392158191585564, "learning_rate": 2.245077063604446e-05, "loss": 0.4011, "num_tokens": 1666130775.0, "step": 6365 }, { "epoch": 2.9696969696969697, "grad_norm": 0.31402160061675133, "learning_rate": 2.241697071424345e-05, "loss": 0.4065, "num_tokens": 1667441495.0, "step": 6370 }, { "epoch": 2.972027972027972, "grad_norm": 0.3762934050333541, "learning_rate": 2.2383182875710424e-05, "loss": 0.4137, "num_tokens": 1668752215.0, "step": 6375 }, { "epoch": 2.9743589743589745, "grad_norm": 0.34874468121941143, "learning_rate": 2.23494072007651e-05, "loss": 0.4089, "num_tokens": 1670062935.0, "step": 6380 }, { "epoch": 2.976689976689977, "grad_norm": 0.31091397228227735, "learning_rate": 2.231564376969829e-05, "loss": 0.4169, "num_tokens": 1671372884.0, "step": 6385 }, { "epoch": 2.979020979020979, "grad_norm": 0.34721881259745, "learning_rate": 2.2281892662771703e-05, "loss": 0.4073, "num_tokens": 1672668051.0, "step": 6390 }, { "epoch": 2.981351981351981, "grad_norm": 0.32893411375726883, "learning_rate": 2.224815396021772e-05, "loss": 0.4115, "num_tokens": 1673978771.0, "step": 6395 }, { "epoch": 2.983682983682984, "grad_norm": 0.3148206325892906, "learning_rate": 2.221442774223929e-05, "loss": 0.3946, "num_tokens": 1675289491.0, "step": 6400 }, { "epoch": 2.986013986013986, "grad_norm": 0.33342285443614306, "learning_rate": 2.2180714089009652e-05, "loss": 0.3874, "num_tokens": 1676600211.0, "step": 6405 }, { "epoch": 2.988344988344988, "grad_norm": 0.3144256371115021, "learning_rate": 2.214701308067216e-05, "loss": 0.4006, "num_tokens": 1677910931.0, "step": 6410 }, { "epoch": 2.9906759906759905, "grad_norm": 0.33559767396859935, "learning_rate": 2.211332479734013e-05, "loss": 0.4079, "num_tokens": 1679221651.0, "step": 6415 }, { "epoch": 2.993006993006993, "grad_norm": 0.3406414559616791, "learning_rate": 2.207964931909663e-05, "loss": 0.3905, "num_tokens": 1680532371.0, "step": 6420 }, { "epoch": 2.9953379953379953, "grad_norm": 0.34257530422603977, "learning_rate": 2.2045986725994287e-05, "loss": 0.4173, "num_tokens": 1681843091.0, "step": 6425 }, { "epoch": 2.9976689976689976, "grad_norm": 0.36116021112956004, "learning_rate": 2.2012337098055086e-05, "loss": 0.4182, "num_tokens": 1683153811.0, "step": 6430 }, { "epoch": 3.0, "grad_norm": 0.33421231681330704, "learning_rate": 2.19787005152702e-05, "loss": 0.3941, "num_tokens": 1684464531.0, "step": 6435 }, { "epoch": 3.0023310023310024, "grad_norm": 0.3393950285548961, "learning_rate": 2.1945077057599804e-05, "loss": 0.3565, "num_tokens": 1685775251.0, "step": 6440 }, { "epoch": 3.0046620046620047, "grad_norm": 0.3640517357882801, "learning_rate": 2.191146680497284e-05, "loss": 0.3643, "num_tokens": 1687076615.0, "step": 6445 }, { "epoch": 3.006993006993007, "grad_norm": 0.3644950262571567, "learning_rate": 2.1877869837286896e-05, "loss": 0.3655, "num_tokens": 1688387335.0, "step": 6450 }, { "epoch": 3.0093240093240095, "grad_norm": 0.36701056648807145, "learning_rate": 2.1844286234407947e-05, "loss": 0.3499, "num_tokens": 1689698055.0, "step": 6455 }, { "epoch": 3.011655011655012, "grad_norm": 0.3865682129879495, "learning_rate": 2.181071607617022e-05, "loss": 0.3712, "num_tokens": 1691008775.0, "step": 6460 }, { "epoch": 3.013986013986014, "grad_norm": 0.33558912130902724, "learning_rate": 2.1777159442375967e-05, "loss": 0.362, "num_tokens": 1692319495.0, "step": 6465 }, { "epoch": 3.016317016317016, "grad_norm": 0.3541099475736384, "learning_rate": 2.1743616412795303e-05, "loss": 0.3473, "num_tokens": 1693630024.0, "step": 6470 }, { "epoch": 3.0186480186480185, "grad_norm": 0.3374133971809518, "learning_rate": 2.1710087067165998e-05, "loss": 0.3659, "num_tokens": 1694940744.0, "step": 6475 }, { "epoch": 3.020979020979021, "grad_norm": 0.31181861303337255, "learning_rate": 2.1676571485193282e-05, "loss": 0.367, "num_tokens": 1696244679.0, "step": 6480 }, { "epoch": 3.023310023310023, "grad_norm": 0.34475890728017977, "learning_rate": 2.1643069746549694e-05, "loss": 0.3575, "num_tokens": 1697555399.0, "step": 6485 }, { "epoch": 3.0256410256410255, "grad_norm": 0.3467950380584418, "learning_rate": 2.1609581930874835e-05, "loss": 0.3531, "num_tokens": 1698866119.0, "step": 6490 }, { "epoch": 3.027972027972028, "grad_norm": 0.3620598508450458, "learning_rate": 2.1576108117775205e-05, "loss": 0.3685, "num_tokens": 1700166619.0, "step": 6495 }, { "epoch": 3.0303030303030303, "grad_norm": 0.32443886801232974, "learning_rate": 2.154264838682407e-05, "loss": 0.3438, "num_tokens": 1701477339.0, "step": 6500 }, { "epoch": 3.0326340326340326, "grad_norm": 0.3529808773793806, "learning_rate": 2.1509202817561164e-05, "loss": 0.3613, "num_tokens": 1702788059.0, "step": 6505 }, { "epoch": 3.034965034965035, "grad_norm": 0.34696686691787276, "learning_rate": 2.1475771489492567e-05, "loss": 0.3548, "num_tokens": 1704098779.0, "step": 6510 }, { "epoch": 3.0372960372960374, "grad_norm": 0.35892737626187393, "learning_rate": 2.144235448209052e-05, "loss": 0.3546, "num_tokens": 1705409499.0, "step": 6515 }, { "epoch": 3.0396270396270397, "grad_norm": 0.3442123945875795, "learning_rate": 2.140895187479322e-05, "loss": 0.3461, "num_tokens": 1706707069.0, "step": 6520 }, { "epoch": 3.041958041958042, "grad_norm": 0.3173856196639699, "learning_rate": 2.137556374700463e-05, "loss": 0.3513, "num_tokens": 1708017789.0, "step": 6525 }, { "epoch": 3.0442890442890445, "grad_norm": 0.34717882632048175, "learning_rate": 2.1342190178094267e-05, "loss": 0.3616, "num_tokens": 1709324169.0, "step": 6530 }, { "epoch": 3.046620046620047, "grad_norm": 0.36986993805338514, "learning_rate": 2.1308831247397094e-05, "loss": 0.3543, "num_tokens": 1710634889.0, "step": 6535 }, { "epoch": 3.0489510489510487, "grad_norm": 0.3537014861009382, "learning_rate": 2.1275487034213227e-05, "loss": 0.3434, "num_tokens": 1711945609.0, "step": 6540 }, { "epoch": 3.051282051282051, "grad_norm": 0.3753264100076296, "learning_rate": 2.1242157617807807e-05, "loss": 0.3509, "num_tokens": 1713256329.0, "step": 6545 }, { "epoch": 3.0536130536130535, "grad_norm": 0.3526114805968593, "learning_rate": 2.1208843077410816e-05, "loss": 0.3542, "num_tokens": 1714567049.0, "step": 6550 }, { "epoch": 3.055944055944056, "grad_norm": 0.32727730145454015, "learning_rate": 2.117554349221687e-05, "loss": 0.3536, "num_tokens": 1715877769.0, "step": 6555 }, { "epoch": 3.058275058275058, "grad_norm": 0.3437242965711344, "learning_rate": 2.1142258941385012e-05, "loss": 0.3525, "num_tokens": 1717182601.0, "step": 6560 }, { "epoch": 3.0606060606060606, "grad_norm": 0.33805241257757357, "learning_rate": 2.1108989504038567e-05, "loss": 0.3603, "num_tokens": 1718493321.0, "step": 6565 }, { "epoch": 3.062937062937063, "grad_norm": 0.3645037205678424, "learning_rate": 2.1075735259264935e-05, "loss": 0.3576, "num_tokens": 1719804041.0, "step": 6570 }, { "epoch": 3.0652680652680653, "grad_norm": 0.33019158555715583, "learning_rate": 2.1042496286115383e-05, "loss": 0.3455, "num_tokens": 1721114761.0, "step": 6575 }, { "epoch": 3.0675990675990676, "grad_norm": 0.35114211762624814, "learning_rate": 2.100927266360487e-05, "loss": 0.3624, "num_tokens": 1722425481.0, "step": 6580 }, { "epoch": 3.06993006993007, "grad_norm": 0.341780681859243, "learning_rate": 2.0976064470711908e-05, "loss": 0.3487, "num_tokens": 1723733066.0, "step": 6585 }, { "epoch": 3.0722610722610724, "grad_norm": 0.31101561100825387, "learning_rate": 2.0942871786378283e-05, "loss": 0.3424, "num_tokens": 1725043786.0, "step": 6590 }, { "epoch": 3.0745920745920747, "grad_norm": 0.3413708055501447, "learning_rate": 2.090969468950892e-05, "loss": 0.3449, "num_tokens": 1726347459.0, "step": 6595 }, { "epoch": 3.076923076923077, "grad_norm": 0.3631157917623202, "learning_rate": 2.087653325897172e-05, "loss": 0.3533, "num_tokens": 1727658179.0, "step": 6600 }, { "epoch": 3.0792540792540795, "grad_norm": 0.35244419005544364, "learning_rate": 2.0843387573597324e-05, "loss": 0.3594, "num_tokens": 1728968899.0, "step": 6605 }, { "epoch": 3.0815850815850814, "grad_norm": 0.3705036974511524, "learning_rate": 2.0810257712178914e-05, "loss": 0.3519, "num_tokens": 1730278354.0, "step": 6610 }, { "epoch": 3.0839160839160837, "grad_norm": 0.37121857812051107, "learning_rate": 2.077714375347213e-05, "loss": 0.344, "num_tokens": 1731589074.0, "step": 6615 }, { "epoch": 3.086247086247086, "grad_norm": 0.5821507533226011, "learning_rate": 2.074404577619472e-05, "loss": 0.3588, "num_tokens": 1732899794.0, "step": 6620 }, { "epoch": 3.0885780885780885, "grad_norm": 0.34294237084456936, "learning_rate": 2.071096385902651e-05, "loss": 0.3466, "num_tokens": 1734195166.0, "step": 6625 }, { "epoch": 3.090909090909091, "grad_norm": 0.3654999256904382, "learning_rate": 2.067789808060911e-05, "loss": 0.361, "num_tokens": 1735505886.0, "step": 6630 }, { "epoch": 3.093240093240093, "grad_norm": 0.33471915902491023, "learning_rate": 2.064484851954579e-05, "loss": 0.3542, "num_tokens": 1736816606.0, "step": 6635 }, { "epoch": 3.0955710955710956, "grad_norm": 0.3378622718593044, "learning_rate": 2.061181525440124e-05, "loss": 0.3542, "num_tokens": 1738127326.0, "step": 6640 }, { "epoch": 3.097902097902098, "grad_norm": 0.3179265953787293, "learning_rate": 2.057879836370144e-05, "loss": 0.3444, "num_tokens": 1739438046.0, "step": 6645 }, { "epoch": 3.1002331002331003, "grad_norm": 0.3616349714374049, "learning_rate": 2.0545797925933437e-05, "loss": 0.3502, "num_tokens": 1740748766.0, "step": 6650 }, { "epoch": 3.1025641025641026, "grad_norm": 0.3304854559937638, "learning_rate": 2.0512814019545153e-05, "loss": 0.3549, "num_tokens": 1742059486.0, "step": 6655 }, { "epoch": 3.104895104895105, "grad_norm": 0.3524011066513125, "learning_rate": 2.047984672294521e-05, "loss": 0.3465, "num_tokens": 1743360293.0, "step": 6660 }, { "epoch": 3.1072261072261074, "grad_norm": 0.35458352089191, "learning_rate": 2.044689611450279e-05, "loss": 0.3549, "num_tokens": 1744671013.0, "step": 6665 }, { "epoch": 3.1095571095571097, "grad_norm": 0.3269683114844527, "learning_rate": 2.0413962272547343e-05, "loss": 0.3686, "num_tokens": 1745981733.0, "step": 6670 }, { "epoch": 3.111888111888112, "grad_norm": 0.33666988209851806, "learning_rate": 2.0381045275368504e-05, "loss": 0.3569, "num_tokens": 1747292453.0, "step": 6675 }, { "epoch": 3.114219114219114, "grad_norm": 0.36146342082712163, "learning_rate": 2.034814520121584e-05, "loss": 0.3628, "num_tokens": 1748603173.0, "step": 6680 }, { "epoch": 3.1165501165501164, "grad_norm": 0.3358286209657946, "learning_rate": 2.0315262128298713e-05, "loss": 0.3503, "num_tokens": 1749913893.0, "step": 6685 }, { "epoch": 3.1188811188811187, "grad_norm": 0.335084467637873, "learning_rate": 2.0282396134786052e-05, "loss": 0.3654, "num_tokens": 1751212459.0, "step": 6690 }, { "epoch": 3.121212121212121, "grad_norm": 0.3438646934966656, "learning_rate": 2.024954729880618e-05, "loss": 0.3603, "num_tokens": 1752511336.0, "step": 6695 }, { "epoch": 3.1235431235431235, "grad_norm": 0.3361069117028752, "learning_rate": 2.0216715698446665e-05, "loss": 0.359, "num_tokens": 1753807515.0, "step": 6700 }, { "epoch": 3.125874125874126, "grad_norm": 0.34476799845177647, "learning_rate": 2.0183901411754074e-05, "loss": 0.3559, "num_tokens": 1755118235.0, "step": 6705 }, { "epoch": 3.128205128205128, "grad_norm": 0.3307136432579508, "learning_rate": 2.01511045167338e-05, "loss": 0.3595, "num_tokens": 1756428955.0, "step": 6710 }, { "epoch": 3.1305361305361306, "grad_norm": 0.38351409943354237, "learning_rate": 2.011832509134996e-05, "loss": 0.3515, "num_tokens": 1757739675.0, "step": 6715 }, { "epoch": 3.132867132867133, "grad_norm": 0.3085970573121658, "learning_rate": 2.0085563213525065e-05, "loss": 0.3622, "num_tokens": 1759050395.0, "step": 6720 }, { "epoch": 3.1351981351981353, "grad_norm": 0.33740198770023655, "learning_rate": 2.005281896113997e-05, "loss": 0.3564, "num_tokens": 1760361115.0, "step": 6725 }, { "epoch": 3.1375291375291376, "grad_norm": 0.3727057443139077, "learning_rate": 2.0020092412033587e-05, "loss": 0.3651, "num_tokens": 1761671835.0, "step": 6730 }, { "epoch": 3.13986013986014, "grad_norm": 0.3237016735245595, "learning_rate": 1.9987383644002776e-05, "loss": 0.355, "num_tokens": 1762982555.0, "step": 6735 }, { "epoch": 3.1421911421911424, "grad_norm": 0.3357114616276399, "learning_rate": 1.995469273480212e-05, "loss": 0.3566, "num_tokens": 1764293275.0, "step": 6740 }, { "epoch": 3.1445221445221447, "grad_norm": 0.3438598090126279, "learning_rate": 1.9922019762143744e-05, "loss": 0.3583, "num_tokens": 1765603995.0, "step": 6745 }, { "epoch": 3.1468531468531467, "grad_norm": 0.3456073924056793, "learning_rate": 1.9889364803697137e-05, "loss": 0.3781, "num_tokens": 1766914715.0, "step": 6750 }, { "epoch": 3.149184149184149, "grad_norm": 0.33427973566883806, "learning_rate": 1.9856727937088955e-05, "loss": 0.3451, "num_tokens": 1768213787.0, "step": 6755 }, { "epoch": 3.1515151515151514, "grad_norm": 0.3420541495199111, "learning_rate": 1.9824109239902865e-05, "loss": 0.3705, "num_tokens": 1769519941.0, "step": 6760 }, { "epoch": 3.1538461538461537, "grad_norm": 0.3396999634927412, "learning_rate": 1.9791508789679337e-05, "loss": 0.3563, "num_tokens": 1770830661.0, "step": 6765 }, { "epoch": 3.156177156177156, "grad_norm": 0.3580402097640208, "learning_rate": 1.9758926663915455e-05, "loss": 0.3635, "num_tokens": 1772141381.0, "step": 6770 }, { "epoch": 3.1585081585081585, "grad_norm": 0.3349041898550379, "learning_rate": 1.9726362940064752e-05, "loss": 0.3514, "num_tokens": 1773452101.0, "step": 6775 }, { "epoch": 3.160839160839161, "grad_norm": 0.33160721758218376, "learning_rate": 1.9693817695537e-05, "loss": 0.3556, "num_tokens": 1774762821.0, "step": 6780 }, { "epoch": 3.163170163170163, "grad_norm": 0.34077147929405477, "learning_rate": 1.9661291007698062e-05, "loss": 0.3549, "num_tokens": 1776073541.0, "step": 6785 }, { "epoch": 3.1655011655011656, "grad_norm": 0.350707492092592, "learning_rate": 1.9628782953869696e-05, "loss": 0.3575, "num_tokens": 1777384261.0, "step": 6790 }, { "epoch": 3.167832167832168, "grad_norm": 0.32637137207920947, "learning_rate": 1.959629361132932e-05, "loss": 0.3487, "num_tokens": 1778681608.0, "step": 6795 }, { "epoch": 3.1701631701631703, "grad_norm": 0.34823978144338197, "learning_rate": 1.956382305730993e-05, "loss": 0.3579, "num_tokens": 1779992328.0, "step": 6800 }, { "epoch": 3.1724941724941726, "grad_norm": 0.3231155957000068, "learning_rate": 1.953137136899982e-05, "loss": 0.3485, "num_tokens": 1781303048.0, "step": 6805 }, { "epoch": 3.174825174825175, "grad_norm": 0.32946568056880193, "learning_rate": 1.9498938623542418e-05, "loss": 0.3536, "num_tokens": 1782611783.0, "step": 6810 }, { "epoch": 3.177156177156177, "grad_norm": 0.33586077368245165, "learning_rate": 1.94665248980362e-05, "loss": 0.3476, "num_tokens": 1783922503.0, "step": 6815 }, { "epoch": 3.1794871794871793, "grad_norm": 0.345953994294241, "learning_rate": 1.943413026953434e-05, "loss": 0.3567, "num_tokens": 1785233223.0, "step": 6820 }, { "epoch": 3.1818181818181817, "grad_norm": 0.3350392695147026, "learning_rate": 1.9401754815044665e-05, "loss": 0.368, "num_tokens": 1786536776.0, "step": 6825 }, { "epoch": 3.184149184149184, "grad_norm": 0.34554062524489243, "learning_rate": 1.9369398611529405e-05, "loss": 0.3589, "num_tokens": 1787847496.0, "step": 6830 }, { "epoch": 3.1864801864801864, "grad_norm": 0.3554652710783725, "learning_rate": 1.9337061735905038e-05, "loss": 0.3516, "num_tokens": 1789158216.0, "step": 6835 }, { "epoch": 3.1888111888111887, "grad_norm": 0.3893789449502026, "learning_rate": 1.930474426504209e-05, "loss": 0.3631, "num_tokens": 1790468936.0, "step": 6840 }, { "epoch": 3.191142191142191, "grad_norm": 0.3556543352743324, "learning_rate": 1.9272446275764954e-05, "loss": 0.3733, "num_tokens": 1791779656.0, "step": 6845 }, { "epoch": 3.1934731934731935, "grad_norm": 0.35483552905034643, "learning_rate": 1.924016784485172e-05, "loss": 0.3609, "num_tokens": 1793090376.0, "step": 6850 }, { "epoch": 3.195804195804196, "grad_norm": 0.33949854975550836, "learning_rate": 1.9207909049033972e-05, "loss": 0.3584, "num_tokens": 1794401096.0, "step": 6855 }, { "epoch": 3.198135198135198, "grad_norm": 0.33640749306479956, "learning_rate": 1.9175669964996636e-05, "loss": 0.3633, "num_tokens": 1795711816.0, "step": 6860 }, { "epoch": 3.2004662004662006, "grad_norm": 0.3409998399187101, "learning_rate": 1.9143450669377762e-05, "loss": 0.3634, "num_tokens": 1797022536.0, "step": 6865 }, { "epoch": 3.202797202797203, "grad_norm": 0.3275911678319333, "learning_rate": 1.9111251238768373e-05, "loss": 0.3487, "num_tokens": 1798317072.0, "step": 6870 }, { "epoch": 3.2051282051282053, "grad_norm": 0.3348585258935862, "learning_rate": 1.9079071749712262e-05, "loss": 0.354, "num_tokens": 1799619860.0, "step": 6875 }, { "epoch": 3.2074592074592077, "grad_norm": 0.34340870474918594, "learning_rate": 1.9046912278705815e-05, "loss": 0.363, "num_tokens": 1800930580.0, "step": 6880 }, { "epoch": 3.20979020979021, "grad_norm": 0.3407672323919426, "learning_rate": 1.901477290219784e-05, "loss": 0.3573, "num_tokens": 1802235866.0, "step": 6885 }, { "epoch": 3.212121212121212, "grad_norm": 0.3586302213819541, "learning_rate": 1.898265369658938e-05, "loss": 0.3595, "num_tokens": 1803546586.0, "step": 6890 }, { "epoch": 3.2144522144522143, "grad_norm": 0.3352168444831923, "learning_rate": 1.8950554738233495e-05, "loss": 0.3547, "num_tokens": 1804840952.0, "step": 6895 }, { "epoch": 3.2167832167832167, "grad_norm": 0.33771864707994703, "learning_rate": 1.8918476103435174e-05, "loss": 0.3581, "num_tokens": 1806151672.0, "step": 6900 }, { "epoch": 3.219114219114219, "grad_norm": 0.32791228261887584, "learning_rate": 1.888641786845102e-05, "loss": 0.3475, "num_tokens": 1807462392.0, "step": 6905 }, { "epoch": 3.2214452214452214, "grad_norm": 0.3475449829583515, "learning_rate": 1.8854380109489206e-05, "loss": 0.3597, "num_tokens": 1808759709.0, "step": 6910 }, { "epoch": 3.2237762237762237, "grad_norm": 0.349286003745334, "learning_rate": 1.88223629027092e-05, "loss": 0.3702, "num_tokens": 1810057084.0, "step": 6915 }, { "epoch": 3.226107226107226, "grad_norm": 0.34279824899570205, "learning_rate": 1.8790366324221616e-05, "loss": 0.3572, "num_tokens": 1811367804.0, "step": 6920 }, { "epoch": 3.2284382284382285, "grad_norm": 0.3270543901207085, "learning_rate": 1.8758390450088025e-05, "loss": 0.3581, "num_tokens": 1812678524.0, "step": 6925 }, { "epoch": 3.230769230769231, "grad_norm": 0.3719710378237588, "learning_rate": 1.8726435356320804e-05, "loss": 0.3503, "num_tokens": 1813989244.0, "step": 6930 }, { "epoch": 3.233100233100233, "grad_norm": 0.34389192152825504, "learning_rate": 1.8694501118882902e-05, "loss": 0.3677, "num_tokens": 1815299964.0, "step": 6935 }, { "epoch": 3.2354312354312356, "grad_norm": 0.3206403097527311, "learning_rate": 1.8662587813687704e-05, "loss": 0.3698, "num_tokens": 1816610684.0, "step": 6940 }, { "epoch": 3.237762237762238, "grad_norm": 0.3333938729526916, "learning_rate": 1.8630695516598832e-05, "loss": 0.3517, "num_tokens": 1817921404.0, "step": 6945 }, { "epoch": 3.2400932400932403, "grad_norm": 0.3453920815805985, "learning_rate": 1.8598824303429985e-05, "loss": 0.3608, "num_tokens": 1819232124.0, "step": 6950 }, { "epoch": 3.242424242424242, "grad_norm": 0.34058628970443383, "learning_rate": 1.8566974249944707e-05, "loss": 0.356, "num_tokens": 1820534563.0, "step": 6955 }, { "epoch": 3.2447552447552446, "grad_norm": 0.3288432699435565, "learning_rate": 1.8535145431856266e-05, "loss": 0.3554, "num_tokens": 1821845283.0, "step": 6960 }, { "epoch": 3.247086247086247, "grad_norm": 0.35168833123437004, "learning_rate": 1.8503337924827446e-05, "loss": 0.3537, "num_tokens": 1823156003.0, "step": 6965 }, { "epoch": 3.2494172494172493, "grad_norm": 0.3392709841202291, "learning_rate": 1.8471551804470372e-05, "loss": 0.3557, "num_tokens": 1824466723.0, "step": 6970 }, { "epoch": 3.2517482517482517, "grad_norm": 0.3426042646396951, "learning_rate": 1.8439787146346314e-05, "loss": 0.3532, "num_tokens": 1825777443.0, "step": 6975 }, { "epoch": 3.254079254079254, "grad_norm": 0.3454602775337345, "learning_rate": 1.8408044025965555e-05, "loss": 0.3484, "num_tokens": 1827081333.0, "step": 6980 }, { "epoch": 3.2564102564102564, "grad_norm": 0.36506433832795027, "learning_rate": 1.8376322518787144e-05, "loss": 0.3621, "num_tokens": 1828392053.0, "step": 6985 }, { "epoch": 3.2587412587412588, "grad_norm": 0.36524454212892954, "learning_rate": 1.8344622700218774e-05, "loss": 0.3632, "num_tokens": 1829702773.0, "step": 6990 }, { "epoch": 3.261072261072261, "grad_norm": 0.3635612315820556, "learning_rate": 1.831294464561655e-05, "loss": 0.3577, "num_tokens": 1831013493.0, "step": 6995 }, { "epoch": 3.2634032634032635, "grad_norm": 0.3439969893709455, "learning_rate": 1.8281288430284898e-05, "loss": 0.3587, "num_tokens": 1832324213.0, "step": 7000 }, { "epoch": 3.265734265734266, "grad_norm": 0.3664905114985399, "learning_rate": 1.8249654129476267e-05, "loss": 0.3643, "num_tokens": 1833634933.0, "step": 7005 }, { "epoch": 3.268065268065268, "grad_norm": 0.3415502009175583, "learning_rate": 1.8218041818391046e-05, "loss": 0.3627, "num_tokens": 1834945653.0, "step": 7010 }, { "epoch": 3.2703962703962706, "grad_norm": 0.3416327517246984, "learning_rate": 1.8186451572177348e-05, "loss": 0.3581, "num_tokens": 1836243452.0, "step": 7015 }, { "epoch": 3.2727272727272725, "grad_norm": 0.34814377337071994, "learning_rate": 1.8154883465930816e-05, "loss": 0.3629, "num_tokens": 1837547262.0, "step": 7020 }, { "epoch": 3.2750582750582753, "grad_norm": 0.34272113854001507, "learning_rate": 1.812333757469447e-05, "loss": 0.3489, "num_tokens": 1838857982.0, "step": 7025 }, { "epoch": 3.277389277389277, "grad_norm": 0.35857308092533485, "learning_rate": 1.8091813973458538e-05, "loss": 0.3756, "num_tokens": 1840156853.0, "step": 7030 }, { "epoch": 3.2797202797202796, "grad_norm": 0.32955350777081055, "learning_rate": 1.806031273716025e-05, "loss": 0.3706, "num_tokens": 1841467573.0, "step": 7035 }, { "epoch": 3.282051282051282, "grad_norm": 0.3456971274249648, "learning_rate": 1.802883394068366e-05, "loss": 0.3567, "num_tokens": 1842778293.0, "step": 7040 }, { "epoch": 3.2843822843822843, "grad_norm": 0.3288793199678882, "learning_rate": 1.7997377658859464e-05, "loss": 0.3604, "num_tokens": 1844089013.0, "step": 7045 }, { "epoch": 3.2867132867132867, "grad_norm": 0.3251910644151693, "learning_rate": 1.796594396646491e-05, "loss": 0.3573, "num_tokens": 1845399733.0, "step": 7050 }, { "epoch": 3.289044289044289, "grad_norm": 0.32298843530806964, "learning_rate": 1.7934532938223457e-05, "loss": 0.368, "num_tokens": 1846710453.0, "step": 7055 }, { "epoch": 3.2913752913752914, "grad_norm": 0.3390547972284236, "learning_rate": 1.7903144648804725e-05, "loss": 0.3488, "num_tokens": 1848021173.0, "step": 7060 }, { "epoch": 3.2937062937062938, "grad_norm": 0.31322899510921975, "learning_rate": 1.7871779172824316e-05, "loss": 0.3567, "num_tokens": 1849331893.0, "step": 7065 }, { "epoch": 3.296037296037296, "grad_norm": 0.3248889681783109, "learning_rate": 1.7840436584843536e-05, "loss": 0.3569, "num_tokens": 1850642613.0, "step": 7070 }, { "epoch": 3.2983682983682985, "grad_norm": 0.3205616691475068, "learning_rate": 1.780911695936931e-05, "loss": 0.3479, "num_tokens": 1851953333.0, "step": 7075 }, { "epoch": 3.300699300699301, "grad_norm": 0.3382664447246441, "learning_rate": 1.7777820370853988e-05, "loss": 0.3602, "num_tokens": 1853264053.0, "step": 7080 }, { "epoch": 3.303030303030303, "grad_norm": 0.3535879556837902, "learning_rate": 1.7746546893695148e-05, "loss": 0.354, "num_tokens": 1854574773.0, "step": 7085 }, { "epoch": 3.3053613053613056, "grad_norm": 0.3688428341220924, "learning_rate": 1.7715296602235427e-05, "loss": 0.3568, "num_tokens": 1855885493.0, "step": 7090 }, { "epoch": 3.3076923076923075, "grad_norm": 0.34122421753248505, "learning_rate": 1.768406957076234e-05, "loss": 0.3659, "num_tokens": 1857196213.0, "step": 7095 }, { "epoch": 3.31002331002331, "grad_norm": 0.3474475995666852, "learning_rate": 1.7652865873508134e-05, "loss": 0.3658, "num_tokens": 1858506933.0, "step": 7100 }, { "epoch": 3.312354312354312, "grad_norm": 0.3120215313160503, "learning_rate": 1.7621685584649543e-05, "loss": 0.3636, "num_tokens": 1859817653.0, "step": 7105 }, { "epoch": 3.3146853146853146, "grad_norm": 0.34100285389129825, "learning_rate": 1.7590528778307693e-05, "loss": 0.3575, "num_tokens": 1861128373.0, "step": 7110 }, { "epoch": 3.317016317016317, "grad_norm": 0.3401139043980884, "learning_rate": 1.7559395528547874e-05, "loss": 0.3716, "num_tokens": 1862423586.0, "step": 7115 }, { "epoch": 3.3193473193473193, "grad_norm": 0.35077446828202064, "learning_rate": 1.752828590937938e-05, "loss": 0.3715, "num_tokens": 1863734306.0, "step": 7120 }, { "epoch": 3.3216783216783217, "grad_norm": 0.3890151970682224, "learning_rate": 1.7497199994755313e-05, "loss": 0.3625, "num_tokens": 1865045026.0, "step": 7125 }, { "epoch": 3.324009324009324, "grad_norm": 0.3357664208904343, "learning_rate": 1.7466137858572467e-05, "loss": 0.3565, "num_tokens": 1866349041.0, "step": 7130 }, { "epoch": 3.3263403263403264, "grad_norm": 0.3489477894369833, "learning_rate": 1.743509957467107e-05, "loss": 0.3615, "num_tokens": 1867659761.0, "step": 7135 }, { "epoch": 3.3286713286713288, "grad_norm": 0.32828301621058453, "learning_rate": 1.740408521683465e-05, "loss": 0.3456, "num_tokens": 1868970481.0, "step": 7140 }, { "epoch": 3.331002331002331, "grad_norm": 0.3585651282189203, "learning_rate": 1.7373094858789905e-05, "loss": 0.366, "num_tokens": 1870281201.0, "step": 7145 }, { "epoch": 3.3333333333333335, "grad_norm": 0.3665074740674538, "learning_rate": 1.7342128574206428e-05, "loss": 0.3575, "num_tokens": 1871582933.0, "step": 7150 }, { "epoch": 3.335664335664336, "grad_norm": 0.3250266204452436, "learning_rate": 1.7311186436696597e-05, "loss": 0.3478, "num_tokens": 1872886587.0, "step": 7155 }, { "epoch": 3.3379953379953378, "grad_norm": 0.3224181885863933, "learning_rate": 1.7280268519815413e-05, "loss": 0.349, "num_tokens": 1874197307.0, "step": 7160 }, { "epoch": 3.3403263403263406, "grad_norm": 0.3388758481335769, "learning_rate": 1.7249374897060282e-05, "loss": 0.3583, "num_tokens": 1875508027.0, "step": 7165 }, { "epoch": 3.3426573426573425, "grad_norm": 0.35014080427325917, "learning_rate": 1.7218505641870846e-05, "loss": 0.3644, "num_tokens": 1876818747.0, "step": 7170 }, { "epoch": 3.344988344988345, "grad_norm": 0.34625439089102006, "learning_rate": 1.7187660827628844e-05, "loss": 0.3544, "num_tokens": 1878129467.0, "step": 7175 }, { "epoch": 3.347319347319347, "grad_norm": 0.3322054073312721, "learning_rate": 1.7156840527657915e-05, "loss": 0.3507, "num_tokens": 1879421830.0, "step": 7180 }, { "epoch": 3.3496503496503496, "grad_norm": 0.31612604477115036, "learning_rate": 1.712604481522339e-05, "loss": 0.3527, "num_tokens": 1880732550.0, "step": 7185 }, { "epoch": 3.351981351981352, "grad_norm": 0.32854072716891647, "learning_rate": 1.70952737635322e-05, "loss": 0.3654, "num_tokens": 1882043270.0, "step": 7190 }, { "epoch": 3.3543123543123543, "grad_norm": 0.3377153380244728, "learning_rate": 1.706452744573262e-05, "loss": 0.3639, "num_tokens": 1883353990.0, "step": 7195 }, { "epoch": 3.3566433566433567, "grad_norm": 0.3368928740025494, "learning_rate": 1.7033805934914126e-05, "loss": 0.3615, "num_tokens": 1884664710.0, "step": 7200 }, { "epoch": 3.358974358974359, "grad_norm": 0.3423816860220159, "learning_rate": 1.7003109304107245e-05, "loss": 0.3521, "num_tokens": 1885975430.0, "step": 7205 }, { "epoch": 3.3613053613053614, "grad_norm": 0.31997739521276675, "learning_rate": 1.697243762628334e-05, "loss": 0.3576, "num_tokens": 1887286150.0, "step": 7210 }, { "epoch": 3.3636363636363638, "grad_norm": 0.31310634183618563, "learning_rate": 1.6941790974354464e-05, "loss": 0.3578, "num_tokens": 1888596870.0, "step": 7215 }, { "epoch": 3.365967365967366, "grad_norm": 0.32276140427338623, "learning_rate": 1.6911169421173194e-05, "loss": 0.3628, "num_tokens": 1889907590.0, "step": 7220 }, { "epoch": 3.3682983682983685, "grad_norm": 0.32607047003096234, "learning_rate": 1.688057303953241e-05, "loss": 0.3642, "num_tokens": 1891218310.0, "step": 7225 }, { "epoch": 3.370629370629371, "grad_norm": 0.34484157121118036, "learning_rate": 1.6850001902165176e-05, "loss": 0.3467, "num_tokens": 1892529030.0, "step": 7230 }, { "epoch": 3.3729603729603728, "grad_norm": 0.3407500682459095, "learning_rate": 1.6819456081744558e-05, "loss": 0.355, "num_tokens": 1893829272.0, "step": 7235 }, { "epoch": 3.375291375291375, "grad_norm": 0.34265345627298444, "learning_rate": 1.6788935650883407e-05, "loss": 0.3559, "num_tokens": 1895139992.0, "step": 7240 }, { "epoch": 3.3776223776223775, "grad_norm": 0.3256176164919399, "learning_rate": 1.6758440682134235e-05, "loss": 0.3537, "num_tokens": 1896450712.0, "step": 7245 }, { "epoch": 3.37995337995338, "grad_norm": 0.34858433343769213, "learning_rate": 1.6727971247989045e-05, "loss": 0.3573, "num_tokens": 1897749047.0, "step": 7250 }, { "epoch": 3.382284382284382, "grad_norm": 0.35644873419658263, "learning_rate": 1.669752742087911e-05, "loss": 0.356, "num_tokens": 1899059767.0, "step": 7255 }, { "epoch": 3.3846153846153846, "grad_norm": 0.3339145275176127, "learning_rate": 1.6667109273174823e-05, "loss": 0.3562, "num_tokens": 1900370487.0, "step": 7260 }, { "epoch": 3.386946386946387, "grad_norm": 0.3479027207281923, "learning_rate": 1.6636716877185575e-05, "loss": 0.3515, "num_tokens": 1901681207.0, "step": 7265 }, { "epoch": 3.3892773892773893, "grad_norm": 0.3373951962701607, "learning_rate": 1.660635030515952e-05, "loss": 0.3524, "num_tokens": 1902991927.0, "step": 7270 }, { "epoch": 3.3916083916083917, "grad_norm": 0.3252740139876212, "learning_rate": 1.6576009629283402e-05, "loss": 0.3585, "num_tokens": 1904302647.0, "step": 7275 }, { "epoch": 3.393939393939394, "grad_norm": 0.32879290977391473, "learning_rate": 1.654569492168243e-05, "loss": 0.3588, "num_tokens": 1905613367.0, "step": 7280 }, { "epoch": 3.3962703962703964, "grad_norm": 0.34714056963970763, "learning_rate": 1.6515406254420085e-05, "loss": 0.3614, "num_tokens": 1906924087.0, "step": 7285 }, { "epoch": 3.3986013986013988, "grad_norm": 0.32182206400603064, "learning_rate": 1.6485143699497917e-05, "loss": 0.3732, "num_tokens": 1908234807.0, "step": 7290 }, { "epoch": 3.400932400932401, "grad_norm": 0.3521858097420374, "learning_rate": 1.6454907328855436e-05, "loss": 0.3601, "num_tokens": 1909539681.0, "step": 7295 }, { "epoch": 3.403263403263403, "grad_norm": 0.3222892499746984, "learning_rate": 1.6424697214369894e-05, "loss": 0.3548, "num_tokens": 1910850401.0, "step": 7300 }, { "epoch": 3.4055944055944054, "grad_norm": 0.3361283433082892, "learning_rate": 1.6394513427856117e-05, "loss": 0.3627, "num_tokens": 1912161121.0, "step": 7305 }, { "epoch": 3.4079254079254078, "grad_norm": 0.34204868238681374, "learning_rate": 1.6364356041066355e-05, "loss": 0.3577, "num_tokens": 1913471841.0, "step": 7310 }, { "epoch": 3.41025641025641, "grad_norm": 0.3302716919114987, "learning_rate": 1.633422512569011e-05, "loss": 0.367, "num_tokens": 1914782561.0, "step": 7315 }, { "epoch": 3.4125874125874125, "grad_norm": 0.3333885988826235, "learning_rate": 1.630412075335393e-05, "loss": 0.3626, "num_tokens": 1916093281.0, "step": 7320 }, { "epoch": 3.414918414918415, "grad_norm": 0.34793721986248155, "learning_rate": 1.627404299562129e-05, "loss": 0.3613, "num_tokens": 1917404001.0, "step": 7325 }, { "epoch": 3.417249417249417, "grad_norm": 0.3338489940605626, "learning_rate": 1.6243991923992404e-05, "loss": 0.3577, "num_tokens": 1918698383.0, "step": 7330 }, { "epoch": 3.4195804195804196, "grad_norm": 0.33919384025686283, "learning_rate": 1.6213967609904014e-05, "loss": 0.3688, "num_tokens": 1920009103.0, "step": 7335 }, { "epoch": 3.421911421911422, "grad_norm": 0.32694232562377856, "learning_rate": 1.6183970124729268e-05, "loss": 0.3559, "num_tokens": 1921319823.0, "step": 7340 }, { "epoch": 3.4242424242424243, "grad_norm": 0.33584818314646175, "learning_rate": 1.615399953977757e-05, "loss": 0.3589, "num_tokens": 1922630543.0, "step": 7345 }, { "epoch": 3.4265734265734267, "grad_norm": 0.31756474094191617, "learning_rate": 1.612405592629433e-05, "loss": 0.3509, "num_tokens": 1923941263.0, "step": 7350 }, { "epoch": 3.428904428904429, "grad_norm": 0.3256749853351091, "learning_rate": 1.6094139355460855e-05, "loss": 0.3589, "num_tokens": 1925251983.0, "step": 7355 }, { "epoch": 3.4312354312354314, "grad_norm": 0.3440498427386265, "learning_rate": 1.6064249898394205e-05, "loss": 0.366, "num_tokens": 1926562703.0, "step": 7360 }, { "epoch": 3.4335664335664333, "grad_norm": 0.31653310321422085, "learning_rate": 1.6034387626146936e-05, "loss": 0.3644, "num_tokens": 1927873423.0, "step": 7365 }, { "epoch": 3.435897435897436, "grad_norm": 0.316076483824156, "learning_rate": 1.6004552609706992e-05, "loss": 0.3512, "num_tokens": 1929184143.0, "step": 7370 }, { "epoch": 3.438228438228438, "grad_norm": 0.32507348992734375, "learning_rate": 1.5974744919997543e-05, "loss": 0.3498, "num_tokens": 1930494863.0, "step": 7375 }, { "epoch": 3.4405594405594404, "grad_norm": 0.3302545729067321, "learning_rate": 1.5944964627876795e-05, "loss": 0.3665, "num_tokens": 1931792174.0, "step": 7380 }, { "epoch": 3.4428904428904428, "grad_norm": 0.32458383971347465, "learning_rate": 1.5915211804137803e-05, "loss": 0.3633, "num_tokens": 1933102894.0, "step": 7385 }, { "epoch": 3.445221445221445, "grad_norm": 0.34226951871442984, "learning_rate": 1.5885486519508347e-05, "loss": 0.3595, "num_tokens": 1934413614.0, "step": 7390 }, { "epoch": 3.4475524475524475, "grad_norm": 0.34897664385155186, "learning_rate": 1.5855788844650744e-05, "loss": 0.3594, "num_tokens": 1935724334.0, "step": 7395 }, { "epoch": 3.44988344988345, "grad_norm": 0.3366078374615375, "learning_rate": 1.5826118850161653e-05, "loss": 0.3551, "num_tokens": 1937035054.0, "step": 7400 }, { "epoch": 3.4522144522144522, "grad_norm": 0.36277795416572367, "learning_rate": 1.5796476606571957e-05, "loss": 0.3704, "num_tokens": 1938345774.0, "step": 7405 }, { "epoch": 3.4545454545454546, "grad_norm": 0.34336029415281094, "learning_rate": 1.576686218434656e-05, "loss": 0.3537, "num_tokens": 1939646875.0, "step": 7410 }, { "epoch": 3.456876456876457, "grad_norm": 0.3393838080049611, "learning_rate": 1.5737275653884225e-05, "loss": 0.3724, "num_tokens": 1940957595.0, "step": 7415 }, { "epoch": 3.4592074592074593, "grad_norm": 0.34552607264965096, "learning_rate": 1.5707717085517427e-05, "loss": 0.3535, "num_tokens": 1942268315.0, "step": 7420 }, { "epoch": 3.4615384615384617, "grad_norm": 0.3161448905098042, "learning_rate": 1.567818654951214e-05, "loss": 0.3551, "num_tokens": 1943579035.0, "step": 7425 }, { "epoch": 3.463869463869464, "grad_norm": 0.33578110841595465, "learning_rate": 1.5648684116067737e-05, "loss": 0.3737, "num_tokens": 1944889755.0, "step": 7430 }, { "epoch": 3.4662004662004664, "grad_norm": 0.3547622429809544, "learning_rate": 1.5619209855316766e-05, "loss": 0.3628, "num_tokens": 1946184922.0, "step": 7435 }, { "epoch": 3.4685314685314683, "grad_norm": 0.3257055842704097, "learning_rate": 1.5589763837324794e-05, "loss": 0.3557, "num_tokens": 1947495642.0, "step": 7440 }, { "epoch": 3.4708624708624707, "grad_norm": 0.3259822417139684, "learning_rate": 1.5560346132090275e-05, "loss": 0.3544, "num_tokens": 1948806362.0, "step": 7445 }, { "epoch": 3.473193473193473, "grad_norm": 0.34231123217132675, "learning_rate": 1.5530956809544354e-05, "loss": 0.3609, "num_tokens": 1950117082.0, "step": 7450 }, { "epoch": 3.4755244755244754, "grad_norm": 0.32649483544948477, "learning_rate": 1.5501595939550674e-05, "loss": 0.352, "num_tokens": 1951427802.0, "step": 7455 }, { "epoch": 3.4778554778554778, "grad_norm": 0.3538064723620207, "learning_rate": 1.547226359190528e-05, "loss": 0.3601, "num_tokens": 1952738522.0, "step": 7460 }, { "epoch": 3.48018648018648, "grad_norm": 0.34134701799353756, "learning_rate": 1.544295983633639e-05, "loss": 0.3543, "num_tokens": 1954049242.0, "step": 7465 }, { "epoch": 3.4825174825174825, "grad_norm": 0.3050547799836289, "learning_rate": 1.5413684742504275e-05, "loss": 0.3426, "num_tokens": 1955359962.0, "step": 7470 }, { "epoch": 3.484848484848485, "grad_norm": 0.33289101107690966, "learning_rate": 1.538443838000104e-05, "loss": 0.3555, "num_tokens": 1956670682.0, "step": 7475 }, { "epoch": 3.4871794871794872, "grad_norm": 0.3248102594895154, "learning_rate": 1.5355220818350517e-05, "loss": 0.3664, "num_tokens": 1957981402.0, "step": 7480 }, { "epoch": 3.4895104895104896, "grad_norm": 0.32624207166938474, "learning_rate": 1.5326032127008077e-05, "loss": 0.3627, "num_tokens": 1959292122.0, "step": 7485 }, { "epoch": 3.491841491841492, "grad_norm": 0.32138847441298957, "learning_rate": 1.5296872375360434e-05, "loss": 0.3596, "num_tokens": 1960602842.0, "step": 7490 }, { "epoch": 3.4941724941724943, "grad_norm": 0.3329259719385868, "learning_rate": 1.526774163272553e-05, "loss": 0.3713, "num_tokens": 1961913562.0, "step": 7495 }, { "epoch": 3.4965034965034967, "grad_norm": 0.3270763048933, "learning_rate": 1.5238639968352346e-05, "loss": 0.3605, "num_tokens": 1963224282.0, "step": 7500 }, { "epoch": 3.4988344988344986, "grad_norm": 0.33222559214722885, "learning_rate": 1.520956745142072e-05, "loss": 0.3557, "num_tokens": 1964535002.0, "step": 7505 }, { "epoch": 3.5011655011655014, "grad_norm": 0.31280758663110264, "learning_rate": 1.518052415104122e-05, "loss": 0.3525, "num_tokens": 1965845722.0, "step": 7510 }, { "epoch": 3.5034965034965033, "grad_norm": 0.3474298912782969, "learning_rate": 1.5151510136254971e-05, "loss": 0.3762, "num_tokens": 1967156442.0, "step": 7515 }, { "epoch": 3.5058275058275057, "grad_norm": 0.35950009162167107, "learning_rate": 1.5122525476033448e-05, "loss": 0.3629, "num_tokens": 1968467162.0, "step": 7520 }, { "epoch": 3.508158508158508, "grad_norm": 0.32565859243171547, "learning_rate": 1.5093570239278348e-05, "loss": 0.3567, "num_tokens": 1969777882.0, "step": 7525 }, { "epoch": 3.5104895104895104, "grad_norm": 0.34433837280720203, "learning_rate": 1.5064644494821472e-05, "loss": 0.3578, "num_tokens": 1971088602.0, "step": 7530 }, { "epoch": 3.5128205128205128, "grad_norm": 0.32951210301147243, "learning_rate": 1.503574831142446e-05, "loss": 0.3564, "num_tokens": 1972384019.0, "step": 7535 }, { "epoch": 3.515151515151515, "grad_norm": 0.3256591839058464, "learning_rate": 1.5006881757778687e-05, "loss": 0.3592, "num_tokens": 1973681803.0, "step": 7540 }, { "epoch": 3.5174825174825175, "grad_norm": 0.3271419894397293, "learning_rate": 1.4978044902505133e-05, "loss": 0.3569, "num_tokens": 1974992523.0, "step": 7545 }, { "epoch": 3.51981351981352, "grad_norm": 0.34506187438893227, "learning_rate": 1.4949237814154132e-05, "loss": 0.3484, "num_tokens": 1976296304.0, "step": 7550 }, { "epoch": 3.5221445221445222, "grad_norm": 0.34392115950662083, "learning_rate": 1.4920460561205263e-05, "loss": 0.3605, "num_tokens": 1977607024.0, "step": 7555 }, { "epoch": 3.5244755244755246, "grad_norm": 0.31870592682573046, "learning_rate": 1.4891713212067223e-05, "loss": 0.3539, "num_tokens": 1978917744.0, "step": 7560 }, { "epoch": 3.526806526806527, "grad_norm": 0.3369778884605504, "learning_rate": 1.4862995835077582e-05, "loss": 0.3616, "num_tokens": 1980228464.0, "step": 7565 }, { "epoch": 3.529137529137529, "grad_norm": 0.3181421663920499, "learning_rate": 1.4834308498502652e-05, "loss": 0.3586, "num_tokens": 1981539184.0, "step": 7570 }, { "epoch": 3.5314685314685317, "grad_norm": 0.3574276899464395, "learning_rate": 1.480565127053737e-05, "loss": 0.3432, "num_tokens": 1982849904.0, "step": 7575 }, { "epoch": 3.5337995337995336, "grad_norm": 0.35575215961743184, "learning_rate": 1.4777024219305092e-05, "loss": 0.3638, "num_tokens": 1984160624.0, "step": 7580 }, { "epoch": 3.5361305361305364, "grad_norm": 0.3463286942931235, "learning_rate": 1.4748427412857407e-05, "loss": 0.3687, "num_tokens": 1985471344.0, "step": 7585 }, { "epoch": 3.5384615384615383, "grad_norm": 0.3249769467333938, "learning_rate": 1.4719860919174039e-05, "loss": 0.3618, "num_tokens": 1986782064.0, "step": 7590 }, { "epoch": 3.5407925407925407, "grad_norm": 0.3489092698701824, "learning_rate": 1.469132480616265e-05, "loss": 0.3592, "num_tokens": 1988092784.0, "step": 7595 }, { "epoch": 3.543123543123543, "grad_norm": 0.33206107999422213, "learning_rate": 1.4662819141658662e-05, "loss": 0.3435, "num_tokens": 1989403504.0, "step": 7600 }, { "epoch": 3.5454545454545454, "grad_norm": 0.32926989102798976, "learning_rate": 1.4634343993425132e-05, "loss": 0.3598, "num_tokens": 1990714224.0, "step": 7605 }, { "epoch": 3.5477855477855478, "grad_norm": 0.3312078503555784, "learning_rate": 1.4605899429152581e-05, "loss": 0.366, "num_tokens": 1992024944.0, "step": 7610 }, { "epoch": 3.55011655011655, "grad_norm": 0.32101930069810486, "learning_rate": 1.45774855164588e-05, "loss": 0.3786, "num_tokens": 1993335664.0, "step": 7615 }, { "epoch": 3.5524475524475525, "grad_norm": 0.3270442973643097, "learning_rate": 1.4549102322888739e-05, "loss": 0.3522, "num_tokens": 1994646384.0, "step": 7620 }, { "epoch": 3.554778554778555, "grad_norm": 0.3410847452328196, "learning_rate": 1.452074991591432e-05, "loss": 0.3661, "num_tokens": 1995957104.0, "step": 7625 }, { "epoch": 3.5571095571095572, "grad_norm": 0.3311841562429209, "learning_rate": 1.4492428362934269e-05, "loss": 0.3644, "num_tokens": 1997267824.0, "step": 7630 }, { "epoch": 3.5594405594405596, "grad_norm": 0.34891575680597303, "learning_rate": 1.4464137731273974e-05, "loss": 0.3659, "num_tokens": 1998564615.0, "step": 7635 }, { "epoch": 3.561771561771562, "grad_norm": 0.3270565111830101, "learning_rate": 1.4435878088185317e-05, "loss": 0.3588, "num_tokens": 1999861934.0, "step": 7640 }, { "epoch": 3.564102564102564, "grad_norm": 0.3566229262833042, "learning_rate": 1.440764950084652e-05, "loss": 0.3651, "num_tokens": 2001172654.0, "step": 7645 }, { "epoch": 3.5664335664335667, "grad_norm": 0.34737324488773064, "learning_rate": 1.4379452036361963e-05, "loss": 0.3685, "num_tokens": 2002483374.0, "step": 7650 }, { "epoch": 3.5687645687645686, "grad_norm": 0.3479603277552613, "learning_rate": 1.4351285761762057e-05, "loss": 0.3603, "num_tokens": 2003794094.0, "step": 7655 }, { "epoch": 3.571095571095571, "grad_norm": 0.320114547771287, "learning_rate": 1.4323150744003075e-05, "loss": 0.3594, "num_tokens": 2005104814.0, "step": 7660 }, { "epoch": 3.5734265734265733, "grad_norm": 0.32862256230307296, "learning_rate": 1.4295047049966958e-05, "loss": 0.3605, "num_tokens": 2006415534.0, "step": 7665 }, { "epoch": 3.5757575757575757, "grad_norm": 0.35139510455853196, "learning_rate": 1.4266974746461217e-05, "loss": 0.3586, "num_tokens": 2007726254.0, "step": 7670 }, { "epoch": 3.578088578088578, "grad_norm": 0.33176820642996774, "learning_rate": 1.4238933900218731e-05, "loss": 0.3515, "num_tokens": 2009033953.0, "step": 7675 }, { "epoch": 3.5804195804195804, "grad_norm": 0.36171642550182254, "learning_rate": 1.4210924577897583e-05, "loss": 0.3604, "num_tokens": 2010344673.0, "step": 7680 }, { "epoch": 3.582750582750583, "grad_norm": 0.3374934342841691, "learning_rate": 1.4182946846080952e-05, "loss": 0.3545, "num_tokens": 2011655393.0, "step": 7685 }, { "epoch": 3.585081585081585, "grad_norm": 0.32415522876729075, "learning_rate": 1.4155000771276878e-05, "loss": 0.3434, "num_tokens": 2012966113.0, "step": 7690 }, { "epoch": 3.5874125874125875, "grad_norm": 0.3552935558976595, "learning_rate": 1.4127086419918178e-05, "loss": 0.366, "num_tokens": 2014276833.0, "step": 7695 }, { "epoch": 3.58974358974359, "grad_norm": 0.3306470072028632, "learning_rate": 1.4099203858362262e-05, "loss": 0.3608, "num_tokens": 2015587553.0, "step": 7700 }, { "epoch": 3.5920745920745922, "grad_norm": 0.32899882379271284, "learning_rate": 1.4071353152890936e-05, "loss": 0.3564, "num_tokens": 2016898273.0, "step": 7705 }, { "epoch": 3.594405594405594, "grad_norm": 0.32893163008661985, "learning_rate": 1.4043534369710307e-05, "loss": 0.3618, "num_tokens": 2018208993.0, "step": 7710 }, { "epoch": 3.596736596736597, "grad_norm": 0.33890514726280374, "learning_rate": 1.4015747574950597e-05, "loss": 0.3585, "num_tokens": 2019519713.0, "step": 7715 }, { "epoch": 3.599067599067599, "grad_norm": 0.35025561594471827, "learning_rate": 1.3987992834665963e-05, "loss": 0.3764, "num_tokens": 2020817105.0, "step": 7720 }, { "epoch": 3.6013986013986012, "grad_norm": 0.322499498679104, "learning_rate": 1.3960270214834381e-05, "loss": 0.3557, "num_tokens": 2022127825.0, "step": 7725 }, { "epoch": 3.6037296037296036, "grad_norm": 0.32465177989473437, "learning_rate": 1.3932579781357477e-05, "loss": 0.3528, "num_tokens": 2023438545.0, "step": 7730 }, { "epoch": 3.606060606060606, "grad_norm": 0.32534204421549856, "learning_rate": 1.390492160006035e-05, "loss": 0.3514, "num_tokens": 2024749265.0, "step": 7735 }, { "epoch": 3.6083916083916083, "grad_norm": 0.3348791455491572, "learning_rate": 1.3877295736691408e-05, "loss": 0.3548, "num_tokens": 2026059985.0, "step": 7740 }, { "epoch": 3.6107226107226107, "grad_norm": 0.30376460026082297, "learning_rate": 1.3849702256922309e-05, "loss": 0.3517, "num_tokens": 2027370705.0, "step": 7745 }, { "epoch": 3.613053613053613, "grad_norm": 0.3202796706000662, "learning_rate": 1.3822141226347646e-05, "loss": 0.3661, "num_tokens": 2028678090.0, "step": 7750 }, { "epoch": 3.6153846153846154, "grad_norm": 0.32249662232530607, "learning_rate": 1.3794612710484905e-05, "loss": 0.351, "num_tokens": 2029988810.0, "step": 7755 }, { "epoch": 3.617715617715618, "grad_norm": 0.3475421569483326, "learning_rate": 1.3767116774774307e-05, "loss": 0.3744, "num_tokens": 2031288913.0, "step": 7760 }, { "epoch": 3.62004662004662, "grad_norm": 0.33679545809282563, "learning_rate": 1.3739653484578586e-05, "loss": 0.3555, "num_tokens": 2032589943.0, "step": 7765 }, { "epoch": 3.6223776223776225, "grad_norm": 0.340971223695831, "learning_rate": 1.3712222905182881e-05, "loss": 0.3499, "num_tokens": 2033900663.0, "step": 7770 }, { "epoch": 3.624708624708625, "grad_norm": 0.3039493470993974, "learning_rate": 1.3684825101794575e-05, "loss": 0.3514, "num_tokens": 2035211383.0, "step": 7775 }, { "epoch": 3.6270396270396272, "grad_norm": 0.3352791563799734, "learning_rate": 1.3657460139543155e-05, "loss": 0.3626, "num_tokens": 2036522103.0, "step": 7780 }, { "epoch": 3.629370629370629, "grad_norm": 0.34252274886593165, "learning_rate": 1.3630128083479998e-05, "loss": 0.3504, "num_tokens": 2037832823.0, "step": 7785 }, { "epoch": 3.631701631701632, "grad_norm": 0.3513374308995934, "learning_rate": 1.3602828998578293e-05, "loss": 0.3684, "num_tokens": 2039143543.0, "step": 7790 }, { "epoch": 3.634032634032634, "grad_norm": 0.33422112695710476, "learning_rate": 1.3575562949732845e-05, "loss": 0.3584, "num_tokens": 2040454263.0, "step": 7795 }, { "epoch": 3.6363636363636362, "grad_norm": 0.35082577383897384, "learning_rate": 1.3548330001759898e-05, "loss": 0.3797, "num_tokens": 2041764983.0, "step": 7800 }, { "epoch": 3.6386946386946386, "grad_norm": 0.3188497601819274, "learning_rate": 1.352113021939705e-05, "loss": 0.3511, "num_tokens": 2043075703.0, "step": 7805 }, { "epoch": 3.641025641025641, "grad_norm": 0.34579309685383136, "learning_rate": 1.3493963667303036e-05, "loss": 0.3563, "num_tokens": 2044386423.0, "step": 7810 }, { "epoch": 3.6433566433566433, "grad_norm": 0.3507371852839906, "learning_rate": 1.3466830410057588e-05, "loss": 0.3416, "num_tokens": 2045697143.0, "step": 7815 }, { "epoch": 3.6456876456876457, "grad_norm": 0.3437622175939715, "learning_rate": 1.343973051216131e-05, "loss": 0.3652, "num_tokens": 2047007863.0, "step": 7820 }, { "epoch": 3.648018648018648, "grad_norm": 0.31415387046199733, "learning_rate": 1.3412664038035507e-05, "loss": 0.3619, "num_tokens": 2048318583.0, "step": 7825 }, { "epoch": 3.6503496503496504, "grad_norm": 0.3398023578909356, "learning_rate": 1.338563105202201e-05, "loss": 0.35, "num_tokens": 2049629303.0, "step": 7830 }, { "epoch": 3.652680652680653, "grad_norm": 0.3850082776958453, "learning_rate": 1.3358631618383041e-05, "loss": 0.3495, "num_tokens": 2050940023.0, "step": 7835 }, { "epoch": 3.655011655011655, "grad_norm": 0.3506882837431775, "learning_rate": 1.3331665801301085e-05, "loss": 0.3587, "num_tokens": 2052245292.0, "step": 7840 }, { "epoch": 3.6573426573426575, "grad_norm": 0.33043151182996516, "learning_rate": 1.3304733664878714e-05, "loss": 0.3757, "num_tokens": 2053556012.0, "step": 7845 }, { "epoch": 3.6596736596736594, "grad_norm": 0.32244585116058333, "learning_rate": 1.32778352731384e-05, "loss": 0.3674, "num_tokens": 2054864779.0, "step": 7850 }, { "epoch": 3.6620046620046622, "grad_norm": 0.34649671127745907, "learning_rate": 1.3250970690022435e-05, "loss": 0.3607, "num_tokens": 2056173760.0, "step": 7855 }, { "epoch": 3.664335664335664, "grad_norm": 0.3592288768686839, "learning_rate": 1.3224139979392739e-05, "loss": 0.3483, "num_tokens": 2057484480.0, "step": 7860 }, { "epoch": 3.6666666666666665, "grad_norm": 0.33267649430105906, "learning_rate": 1.3197343205030677e-05, "loss": 0.3601, "num_tokens": 2058795200.0, "step": 7865 }, { "epoch": 3.668997668997669, "grad_norm": 0.3155221547689689, "learning_rate": 1.317058043063698e-05, "loss": 0.3592, "num_tokens": 2060105920.0, "step": 7870 }, { "epoch": 3.6713286713286712, "grad_norm": 0.32504676536520444, "learning_rate": 1.3143851719831545e-05, "loss": 0.347, "num_tokens": 2061416640.0, "step": 7875 }, { "epoch": 3.6736596736596736, "grad_norm": 0.3390312427438953, "learning_rate": 1.3117157136153275e-05, "loss": 0.3598, "num_tokens": 2062727360.0, "step": 7880 }, { "epoch": 3.675990675990676, "grad_norm": 0.3272385721818141, "learning_rate": 1.3090496743059963e-05, "loss": 0.3457, "num_tokens": 2064038080.0, "step": 7885 }, { "epoch": 3.6783216783216783, "grad_norm": 0.3406171598088719, "learning_rate": 1.3063870603928135e-05, "loss": 0.3619, "num_tokens": 2065348800.0, "step": 7890 }, { "epoch": 3.6806526806526807, "grad_norm": 0.3199792712358228, "learning_rate": 1.3037278782052863e-05, "loss": 0.3676, "num_tokens": 2066659520.0, "step": 7895 }, { "epoch": 3.682983682983683, "grad_norm": 0.32349347623327407, "learning_rate": 1.3010721340647672e-05, "loss": 0.351, "num_tokens": 2067970240.0, "step": 7900 }, { "epoch": 3.6853146853146854, "grad_norm": 0.3197976897891316, "learning_rate": 1.2984198342844317e-05, "loss": 0.3507, "num_tokens": 2069280960.0, "step": 7905 }, { "epoch": 3.687645687645688, "grad_norm": 0.31153094945488263, "learning_rate": 1.2957709851692709e-05, "loss": 0.3531, "num_tokens": 2070591680.0, "step": 7910 }, { "epoch": 3.6899766899766897, "grad_norm": 0.34104591153105757, "learning_rate": 1.293125593016073e-05, "loss": 0.352, "num_tokens": 2071902400.0, "step": 7915 }, { "epoch": 3.6923076923076925, "grad_norm": 0.32756629305528456, "learning_rate": 1.2904836641134058e-05, "loss": 0.3609, "num_tokens": 2073193557.0, "step": 7920 }, { "epoch": 3.6946386946386944, "grad_norm": 0.32387204375058537, "learning_rate": 1.2878452047416065e-05, "loss": 0.3558, "num_tokens": 2074504277.0, "step": 7925 }, { "epoch": 3.6969696969696972, "grad_norm": 0.34675201672498257, "learning_rate": 1.2852102211727648e-05, "loss": 0.3616, "num_tokens": 2075814997.0, "step": 7930 }, { "epoch": 3.699300699300699, "grad_norm": 0.3247586444847732, "learning_rate": 1.2825787196707059e-05, "loss": 0.349, "num_tokens": 2077125717.0, "step": 7935 }, { "epoch": 3.7016317016317015, "grad_norm": 0.3156890734127022, "learning_rate": 1.2799507064909787e-05, "loss": 0.3533, "num_tokens": 2078436437.0, "step": 7940 }, { "epoch": 3.703962703962704, "grad_norm": 0.3227247560032221, "learning_rate": 1.2773261878808413e-05, "loss": 0.3466, "num_tokens": 2079747157.0, "step": 7945 }, { "epoch": 3.7062937062937062, "grad_norm": 0.3216324993277899, "learning_rate": 1.2747051700792412e-05, "loss": 0.3554, "num_tokens": 2081057877.0, "step": 7950 }, { "epoch": 3.7086247086247086, "grad_norm": 0.31249996903728416, "learning_rate": 1.2720876593168052e-05, "loss": 0.3492, "num_tokens": 2082368597.0, "step": 7955 }, { "epoch": 3.710955710955711, "grad_norm": 0.3188316644091349, "learning_rate": 1.2694736618158249e-05, "loss": 0.3458, "num_tokens": 2083679317.0, "step": 7960 }, { "epoch": 3.7132867132867133, "grad_norm": 0.3335771049219983, "learning_rate": 1.2668631837902389e-05, "loss": 0.3424, "num_tokens": 2084990037.0, "step": 7965 }, { "epoch": 3.7156177156177157, "grad_norm": 0.3499985089110688, "learning_rate": 1.2642562314456185e-05, "loss": 0.3534, "num_tokens": 2086300757.0, "step": 7970 }, { "epoch": 3.717948717948718, "grad_norm": 0.3410878602651583, "learning_rate": 1.2616528109791554e-05, "loss": 0.3659, "num_tokens": 2087611477.0, "step": 7975 }, { "epoch": 3.7202797202797204, "grad_norm": 0.3527728670467037, "learning_rate": 1.259052928579646e-05, "loss": 0.3591, "num_tokens": 2088908084.0, "step": 7980 }, { "epoch": 3.722610722610723, "grad_norm": 0.3510762979938676, "learning_rate": 1.2564565904274722e-05, "loss": 0.368, "num_tokens": 2090210123.0, "step": 7985 }, { "epoch": 3.7249417249417247, "grad_norm": 0.37404478878722747, "learning_rate": 1.2538638026945954e-05, "loss": 0.3647, "num_tokens": 2091520843.0, "step": 7990 }, { "epoch": 3.7272727272727275, "grad_norm": 0.3253947492356622, "learning_rate": 1.2512745715445345e-05, "loss": 0.3691, "num_tokens": 2092831563.0, "step": 7995 }, { "epoch": 3.7296037296037294, "grad_norm": 0.3730204049764418, "learning_rate": 1.2486889031323528e-05, "loss": 0.3568, "num_tokens": 2094140244.0, "step": 8000 }, { "epoch": 3.731934731934732, "grad_norm": 0.322985878009181, "learning_rate": 1.2461068036046474e-05, "loss": 0.3558, "num_tokens": 2095450964.0, "step": 8005 }, { "epoch": 3.734265734265734, "grad_norm": 0.3451139274796168, "learning_rate": 1.2435282790995294e-05, "loss": 0.3568, "num_tokens": 2096761684.0, "step": 8010 }, { "epoch": 3.7365967365967365, "grad_norm": 0.32232149066408405, "learning_rate": 1.240953335746611e-05, "loss": 0.3592, "num_tokens": 2098072404.0, "step": 8015 }, { "epoch": 3.738927738927739, "grad_norm": 0.33356015443779835, "learning_rate": 1.2383819796669929e-05, "loss": 0.3485, "num_tokens": 2099383124.0, "step": 8020 }, { "epoch": 3.7412587412587412, "grad_norm": 0.31783927007138674, "learning_rate": 1.235814216973248e-05, "loss": 0.362, "num_tokens": 2100693844.0, "step": 8025 }, { "epoch": 3.7435897435897436, "grad_norm": 0.3258650151987321, "learning_rate": 1.2332500537694061e-05, "loss": 0.3643, "num_tokens": 2102004564.0, "step": 8030 }, { "epoch": 3.745920745920746, "grad_norm": 0.31862835430157704, "learning_rate": 1.2306894961509392e-05, "loss": 0.3559, "num_tokens": 2103315284.0, "step": 8035 }, { "epoch": 3.7482517482517483, "grad_norm": 0.31403800945327537, "learning_rate": 1.2281325502047526e-05, "loss": 0.352, "num_tokens": 2104610250.0, "step": 8040 }, { "epoch": 3.7505827505827507, "grad_norm": 0.3364153402051559, "learning_rate": 1.2255792220091623e-05, "loss": 0.3605, "num_tokens": 2105920970.0, "step": 8045 }, { "epoch": 3.752913752913753, "grad_norm": 0.33818645363457694, "learning_rate": 1.2230295176338843e-05, "loss": 0.3528, "num_tokens": 2107231690.0, "step": 8050 }, { "epoch": 3.755244755244755, "grad_norm": 0.33218579315347985, "learning_rate": 1.2204834431400218e-05, "loss": 0.3646, "num_tokens": 2108542410.0, "step": 8055 }, { "epoch": 3.757575757575758, "grad_norm": 0.3150157899865806, "learning_rate": 1.2179410045800486e-05, "loss": 0.3678, "num_tokens": 2109845114.0, "step": 8060 }, { "epoch": 3.7599067599067597, "grad_norm": 0.3239476998553056, "learning_rate": 1.2154022079977941e-05, "loss": 0.364, "num_tokens": 2111155834.0, "step": 8065 }, { "epoch": 3.762237762237762, "grad_norm": 0.30771382308356826, "learning_rate": 1.2128670594284317e-05, "loss": 0.3656, "num_tokens": 2112466554.0, "step": 8070 }, { "epoch": 3.7645687645687644, "grad_norm": 0.36628760033281316, "learning_rate": 1.2103355648984627e-05, "loss": 0.3539, "num_tokens": 2113777274.0, "step": 8075 }, { "epoch": 3.766899766899767, "grad_norm": 0.3435307692569418, "learning_rate": 1.2078077304256999e-05, "loss": 0.3698, "num_tokens": 2115087994.0, "step": 8080 }, { "epoch": 3.769230769230769, "grad_norm": 0.31904278075717907, "learning_rate": 1.2052835620192577e-05, "loss": 0.3539, "num_tokens": 2116398714.0, "step": 8085 }, { "epoch": 3.7715617715617715, "grad_norm": 0.33066965750120453, "learning_rate": 1.2027630656795365e-05, "loss": 0.3608, "num_tokens": 2117709434.0, "step": 8090 }, { "epoch": 3.773892773892774, "grad_norm": 0.3254496733507665, "learning_rate": 1.2002462473982034e-05, "loss": 0.3646, "num_tokens": 2119020154.0, "step": 8095 }, { "epoch": 3.7762237762237763, "grad_norm": 0.318029815318303, "learning_rate": 1.1977331131581872e-05, "loss": 0.3643, "num_tokens": 2120330874.0, "step": 8100 }, { "epoch": 3.7785547785547786, "grad_norm": 0.33657956842752124, "learning_rate": 1.1952236689336547e-05, "loss": 0.3483, "num_tokens": 2121641594.0, "step": 8105 }, { "epoch": 3.780885780885781, "grad_norm": 0.339494433625539, "learning_rate": 1.1927179206900036e-05, "loss": 0.3624, "num_tokens": 2122952314.0, "step": 8110 }, { "epoch": 3.7832167832167833, "grad_norm": 0.32937653713889037, "learning_rate": 1.1902158743838455e-05, "loss": 0.3578, "num_tokens": 2124263034.0, "step": 8115 }, { "epoch": 3.7855477855477857, "grad_norm": 0.31184931579510083, "learning_rate": 1.1877175359629895e-05, "loss": 0.3515, "num_tokens": 2125564666.0, "step": 8120 }, { "epoch": 3.787878787878788, "grad_norm": 0.32358413426775373, "learning_rate": 1.185222911366433e-05, "loss": 0.3634, "num_tokens": 2126875386.0, "step": 8125 }, { "epoch": 3.79020979020979, "grad_norm": 0.34491635141056515, "learning_rate": 1.1827320065243442e-05, "loss": 0.3663, "num_tokens": 2128186106.0, "step": 8130 }, { "epoch": 3.792540792540793, "grad_norm": 0.3263014090650137, "learning_rate": 1.1802448273580482e-05, "loss": 0.3531, "num_tokens": 2129496826.0, "step": 8135 }, { "epoch": 3.7948717948717947, "grad_norm": 0.34055454076158553, "learning_rate": 1.1777613797800132e-05, "loss": 0.3526, "num_tokens": 2130807546.0, "step": 8140 }, { "epoch": 3.797202797202797, "grad_norm": 0.35098519401821454, "learning_rate": 1.175281669693839e-05, "loss": 0.3567, "num_tokens": 2132118266.0, "step": 8145 }, { "epoch": 3.7995337995337994, "grad_norm": 0.3266773596447305, "learning_rate": 1.1728057029942377e-05, "loss": 0.3531, "num_tokens": 2133428986.0, "step": 8150 }, { "epoch": 3.801864801864802, "grad_norm": 0.31360535728361383, "learning_rate": 1.170333485567025e-05, "loss": 0.3674, "num_tokens": 2134739706.0, "step": 8155 }, { "epoch": 3.804195804195804, "grad_norm": 0.31624940012549485, "learning_rate": 1.1678650232891021e-05, "loss": 0.3518, "num_tokens": 2136050426.0, "step": 8160 }, { "epoch": 3.8065268065268065, "grad_norm": 0.3141407909472399, "learning_rate": 1.1654003220284459e-05, "loss": 0.3619, "num_tokens": 2137356817.0, "step": 8165 }, { "epoch": 3.808857808857809, "grad_norm": 0.34368525998084476, "learning_rate": 1.1629393876440894e-05, "loss": 0.3526, "num_tokens": 2138667537.0, "step": 8170 }, { "epoch": 3.8111888111888113, "grad_norm": 0.33230532572557525, "learning_rate": 1.1604822259861143e-05, "loss": 0.3554, "num_tokens": 2139978257.0, "step": 8175 }, { "epoch": 3.8135198135198136, "grad_norm": 0.3285215453270888, "learning_rate": 1.1580288428956326e-05, "loss": 0.3545, "num_tokens": 2141288977.0, "step": 8180 }, { "epoch": 3.815850815850816, "grad_norm": 0.32253369632017526, "learning_rate": 1.1555792442047727e-05, "loss": 0.3545, "num_tokens": 2142599697.0, "step": 8185 }, { "epoch": 3.8181818181818183, "grad_norm": 0.31275107239667255, "learning_rate": 1.1531334357366687e-05, "loss": 0.3648, "num_tokens": 2143910417.0, "step": 8190 }, { "epoch": 3.8205128205128203, "grad_norm": 0.3371090455672606, "learning_rate": 1.1506914233054449e-05, "loss": 0.3548, "num_tokens": 2145215363.0, "step": 8195 }, { "epoch": 3.822843822843823, "grad_norm": 0.3330220005685937, "learning_rate": 1.1482532127161987e-05, "loss": 0.3682, "num_tokens": 2146526083.0, "step": 8200 }, { "epoch": 3.825174825174825, "grad_norm": 0.3163036289984921, "learning_rate": 1.1458188097649931e-05, "loss": 0.3652, "num_tokens": 2147828267.0, "step": 8205 }, { "epoch": 3.8275058275058274, "grad_norm": 0.31880150929387496, "learning_rate": 1.143388220238839e-05, "loss": 0.3612, "num_tokens": 2149138987.0, "step": 8210 }, { "epoch": 3.8298368298368297, "grad_norm": 0.319810402665015, "learning_rate": 1.1409614499156807e-05, "loss": 0.355, "num_tokens": 2150449707.0, "step": 8215 }, { "epoch": 3.832167832167832, "grad_norm": 0.3394499363463578, "learning_rate": 1.138538504564384e-05, "loss": 0.3543, "num_tokens": 2151760427.0, "step": 8220 }, { "epoch": 3.8344988344988344, "grad_norm": 0.3339998319196104, "learning_rate": 1.1361193899447239e-05, "loss": 0.3643, "num_tokens": 2153071147.0, "step": 8225 }, { "epoch": 3.836829836829837, "grad_norm": 0.3283698373871858, "learning_rate": 1.1337041118073673e-05, "loss": 0.365, "num_tokens": 2154381867.0, "step": 8230 }, { "epoch": 3.839160839160839, "grad_norm": 0.31803261230391683, "learning_rate": 1.1312926758938598e-05, "loss": 0.3542, "num_tokens": 2155692587.0, "step": 8235 }, { "epoch": 3.8414918414918415, "grad_norm": 0.32415297440890517, "learning_rate": 1.1288850879366178e-05, "loss": 0.3476, "num_tokens": 2157003307.0, "step": 8240 }, { "epoch": 3.843822843822844, "grad_norm": 0.32171582377815044, "learning_rate": 1.1264813536589063e-05, "loss": 0.3505, "num_tokens": 2158302271.0, "step": 8245 }, { "epoch": 3.8461538461538463, "grad_norm": 0.32594823868385486, "learning_rate": 1.1240814787748294e-05, "loss": 0.3542, "num_tokens": 2159612991.0, "step": 8250 }, { "epoch": 3.8484848484848486, "grad_norm": 0.3130567477371064, "learning_rate": 1.1216854689893208e-05, "loss": 0.3474, "num_tokens": 2160923711.0, "step": 8255 }, { "epoch": 3.8508158508158505, "grad_norm": 0.32884981317619555, "learning_rate": 1.119293329998122e-05, "loss": 0.3613, "num_tokens": 2162234431.0, "step": 8260 }, { "epoch": 3.8531468531468533, "grad_norm": 0.3182477239378319, "learning_rate": 1.116905067487774e-05, "loss": 0.3496, "num_tokens": 2163545151.0, "step": 8265 }, { "epoch": 3.8554778554778553, "grad_norm": 0.3238906779321128, "learning_rate": 1.1145206871356035e-05, "loss": 0.3755, "num_tokens": 2164845172.0, "step": 8270 }, { "epoch": 3.857808857808858, "grad_norm": 0.3230035270398852, "learning_rate": 1.1121401946097089e-05, "loss": 0.3579, "num_tokens": 2166155892.0, "step": 8275 }, { "epoch": 3.86013986013986, "grad_norm": 0.34202250350581626, "learning_rate": 1.1097635955689447e-05, "loss": 0.367, "num_tokens": 2167466612.0, "step": 8280 }, { "epoch": 3.8624708624708624, "grad_norm": 0.33282529022268353, "learning_rate": 1.107390895662912e-05, "loss": 0.3634, "num_tokens": 2168777332.0, "step": 8285 }, { "epoch": 3.8648018648018647, "grad_norm": 0.34181595087542227, "learning_rate": 1.1050221005319422e-05, "loss": 0.3674, "num_tokens": 2170088052.0, "step": 8290 }, { "epoch": 3.867132867132867, "grad_norm": 0.3036597141524631, "learning_rate": 1.1026572158070831e-05, "loss": 0.3555, "num_tokens": 2171398772.0, "step": 8295 }, { "epoch": 3.8694638694638694, "grad_norm": 0.32232320158504335, "learning_rate": 1.1002962471100883e-05, "loss": 0.3548, "num_tokens": 2172709492.0, "step": 8300 }, { "epoch": 3.871794871794872, "grad_norm": 0.3210640108188077, "learning_rate": 1.0979392000534027e-05, "loss": 0.3585, "num_tokens": 2174020212.0, "step": 8305 }, { "epoch": 3.874125874125874, "grad_norm": 0.3273031634760707, "learning_rate": 1.0955860802401465e-05, "loss": 0.3868, "num_tokens": 2175330932.0, "step": 8310 }, { "epoch": 3.8764568764568765, "grad_norm": 0.31087673967738866, "learning_rate": 1.0932368932641074e-05, "loss": 0.3553, "num_tokens": 2176633046.0, "step": 8315 }, { "epoch": 3.878787878787879, "grad_norm": 0.3289983288827823, "learning_rate": 1.0908916447097199e-05, "loss": 0.3661, "num_tokens": 2177943766.0, "step": 8320 }, { "epoch": 3.8811188811188813, "grad_norm": 0.35024193147589183, "learning_rate": 1.0885503401520598e-05, "loss": 0.3489, "num_tokens": 2179254486.0, "step": 8325 }, { "epoch": 3.8834498834498836, "grad_norm": 0.3317648553400327, "learning_rate": 1.0862129851568261e-05, "loss": 0.3525, "num_tokens": 2180565206.0, "step": 8330 }, { "epoch": 3.8857808857808855, "grad_norm": 0.34059702892010785, "learning_rate": 1.0838795852803285e-05, "loss": 0.3658, "num_tokens": 2181875926.0, "step": 8335 }, { "epoch": 3.8881118881118883, "grad_norm": 0.33839087803574835, "learning_rate": 1.0815501460694752e-05, "loss": 0.357, "num_tokens": 2183186646.0, "step": 8340 }, { "epoch": 3.8904428904428903, "grad_norm": 0.33882338228224024, "learning_rate": 1.0792246730617587e-05, "loss": 0.352, "num_tokens": 2184497366.0, "step": 8345 }, { "epoch": 3.8927738927738926, "grad_norm": 0.33693236579652286, "learning_rate": 1.0769031717852435e-05, "loss": 0.3518, "num_tokens": 2185808086.0, "step": 8350 }, { "epoch": 3.895104895104895, "grad_norm": 0.330046544608827, "learning_rate": 1.0745856477585534e-05, "loss": 0.3645, "num_tokens": 2187118806.0, "step": 8355 }, { "epoch": 3.8974358974358974, "grad_norm": 0.3214183099709155, "learning_rate": 1.0722721064908554e-05, "loss": 0.3602, "num_tokens": 2188419457.0, "step": 8360 }, { "epoch": 3.8997668997668997, "grad_norm": 0.3166655849628983, "learning_rate": 1.0699625534818512e-05, "loss": 0.3618, "num_tokens": 2189730177.0, "step": 8365 }, { "epoch": 3.902097902097902, "grad_norm": 0.31754371228858536, "learning_rate": 1.0676569942217596e-05, "loss": 0.3628, "num_tokens": 2191040897.0, "step": 8370 }, { "epoch": 3.9044289044289044, "grad_norm": 0.3060117009044129, "learning_rate": 1.0653554341913072e-05, "loss": 0.3535, "num_tokens": 2192351617.0, "step": 8375 }, { "epoch": 3.906759906759907, "grad_norm": 0.3435379540747769, "learning_rate": 1.0630578788617131e-05, "loss": 0.3642, "num_tokens": 2193648973.0, "step": 8380 }, { "epoch": 3.909090909090909, "grad_norm": 0.3389237512793272, "learning_rate": 1.060764333694676e-05, "loss": 0.3509, "num_tokens": 2194959693.0, "step": 8385 }, { "epoch": 3.9114219114219115, "grad_norm": 0.32842554162135046, "learning_rate": 1.0584748041423623e-05, "loss": 0.3556, "num_tokens": 2196265846.0, "step": 8390 }, { "epoch": 3.913752913752914, "grad_norm": 0.33837517172721177, "learning_rate": 1.0561892956473932e-05, "loss": 0.3573, "num_tokens": 2197568195.0, "step": 8395 }, { "epoch": 3.916083916083916, "grad_norm": 0.3218786208698996, "learning_rate": 1.0539078136428294e-05, "loss": 0.3634, "num_tokens": 2198878915.0, "step": 8400 }, { "epoch": 3.9184149184149186, "grad_norm": 0.351508799242857, "learning_rate": 1.0516303635521606e-05, "loss": 0.3753, "num_tokens": 2200185526.0, "step": 8405 }, { "epoch": 3.9207459207459205, "grad_norm": 0.34131167249228345, "learning_rate": 1.0493569507892938e-05, "loss": 0.3613, "num_tokens": 2201496246.0, "step": 8410 }, { "epoch": 3.9230769230769234, "grad_norm": 0.3318435319138198, "learning_rate": 1.0470875807585354e-05, "loss": 0.3572, "num_tokens": 2202799214.0, "step": 8415 }, { "epoch": 3.9254079254079253, "grad_norm": 0.33378030300757455, "learning_rate": 1.0448222588545837e-05, "loss": 0.3565, "num_tokens": 2204109934.0, "step": 8420 }, { "epoch": 3.9277389277389276, "grad_norm": 0.33527036359922735, "learning_rate": 1.0425609904625137e-05, "loss": 0.3599, "num_tokens": 2205420654.0, "step": 8425 }, { "epoch": 3.93006993006993, "grad_norm": 0.31167055961688644, "learning_rate": 1.0403037809577636e-05, "loss": 0.3581, "num_tokens": 2206731374.0, "step": 8430 }, { "epoch": 3.9324009324009324, "grad_norm": 0.3311994709997141, "learning_rate": 1.0380506357061221e-05, "loss": 0.3695, "num_tokens": 2208042094.0, "step": 8435 }, { "epoch": 3.9347319347319347, "grad_norm": 0.30544971351661804, "learning_rate": 1.03580156006372e-05, "loss": 0.3575, "num_tokens": 2209352814.0, "step": 8440 }, { "epoch": 3.937062937062937, "grad_norm": 0.3163147870974832, "learning_rate": 1.0335565593770102e-05, "loss": 0.3519, "num_tokens": 2210663534.0, "step": 8445 }, { "epoch": 3.9393939393939394, "grad_norm": 0.31310671731601936, "learning_rate": 1.0313156389827596e-05, "loss": 0.3589, "num_tokens": 2211974254.0, "step": 8450 }, { "epoch": 3.941724941724942, "grad_norm": 0.3359729029408067, "learning_rate": 1.0290788042080375e-05, "loss": 0.3617, "num_tokens": 2213279287.0, "step": 8455 }, { "epoch": 3.944055944055944, "grad_norm": 0.33715090925620084, "learning_rate": 1.026846060370199e-05, "loss": 0.3555, "num_tokens": 2214584857.0, "step": 8460 }, { "epoch": 3.9463869463869465, "grad_norm": 0.30902647410730066, "learning_rate": 1.0246174127768738e-05, "loss": 0.3595, "num_tokens": 2215888814.0, "step": 8465 }, { "epoch": 3.948717948717949, "grad_norm": 0.3291193720775341, "learning_rate": 1.0223928667259556e-05, "loss": 0.3673, "num_tokens": 2217199534.0, "step": 8470 }, { "epoch": 3.951048951048951, "grad_norm": 0.34291852278637736, "learning_rate": 1.020172427505588e-05, "loss": 0.3525, "num_tokens": 2218509771.0, "step": 8475 }, { "epoch": 3.9533799533799536, "grad_norm": 0.3561232943784015, "learning_rate": 1.0179561003941507e-05, "loss": 0.3538, "num_tokens": 2219820491.0, "step": 8480 }, { "epoch": 3.9557109557109555, "grad_norm": 0.3157272502198812, "learning_rate": 1.0157438906602487e-05, "loss": 0.3524, "num_tokens": 2221130082.0, "step": 8485 }, { "epoch": 3.958041958041958, "grad_norm": 0.33080441973925323, "learning_rate": 1.0135358035627007e-05, "loss": 0.3614, "num_tokens": 2222424293.0, "step": 8490 }, { "epoch": 3.9603729603729603, "grad_norm": 0.3289328935479798, "learning_rate": 1.0113318443505226e-05, "loss": 0.3659, "num_tokens": 2223735013.0, "step": 8495 }, { "epoch": 3.9627039627039626, "grad_norm": 0.31711304131356893, "learning_rate": 1.0091320182629193e-05, "loss": 0.3653, "num_tokens": 2225045733.0, "step": 8500 }, { "epoch": 3.965034965034965, "grad_norm": 0.323224468581729, "learning_rate": 1.0069363305292708e-05, "loss": 0.3628, "num_tokens": 2226356453.0, "step": 8505 }, { "epoch": 3.9673659673659674, "grad_norm": 0.3399766154632268, "learning_rate": 1.0047447863691175e-05, "loss": 0.3523, "num_tokens": 2227667173.0, "step": 8510 }, { "epoch": 3.9696969696969697, "grad_norm": 0.31305493281081237, "learning_rate": 1.0025573909921515e-05, "loss": 0.3553, "num_tokens": 2228973398.0, "step": 8515 }, { "epoch": 3.972027972027972, "grad_norm": 0.3312572614176095, "learning_rate": 1.0003741495982034e-05, "loss": 0.3563, "num_tokens": 2230272637.0, "step": 8520 }, { "epoch": 3.9743589743589745, "grad_norm": 0.3358237225226056, "learning_rate": 9.981950673772256e-06, "loss": 0.3611, "num_tokens": 2231583357.0, "step": 8525 }, { "epoch": 3.976689976689977, "grad_norm": 0.31567748638452275, "learning_rate": 9.960201495092871e-06, "loss": 0.37, "num_tokens": 2232894077.0, "step": 8530 }, { "epoch": 3.979020979020979, "grad_norm": 0.33864552842513596, "learning_rate": 9.938494011645553e-06, "loss": 0.3614, "num_tokens": 2234204797.0, "step": 8535 }, { "epoch": 3.981351981351981, "grad_norm": 0.3294973703926195, "learning_rate": 9.916828275032868e-06, "loss": 0.3585, "num_tokens": 2235502698.0, "step": 8540 }, { "epoch": 3.983682983682984, "grad_norm": 0.324896447348713, "learning_rate": 9.895204336758132e-06, "loss": 0.3539, "num_tokens": 2236813418.0, "step": 8545 }, { "epoch": 3.986013986013986, "grad_norm": 0.31758055886736325, "learning_rate": 9.87362224822531e-06, "loss": 0.3543, "num_tokens": 2238114298.0, "step": 8550 }, { "epoch": 3.988344988344988, "grad_norm": 0.34580160569991886, "learning_rate": 9.85208206073889e-06, "loss": 0.3552, "num_tokens": 2239417127.0, "step": 8555 }, { "epoch": 3.9906759906759905, "grad_norm": 0.3386420228574512, "learning_rate": 9.830583825503725e-06, "loss": 0.3521, "num_tokens": 2240727847.0, "step": 8560 }, { "epoch": 3.993006993006993, "grad_norm": 0.32993309995325326, "learning_rate": 9.80912759362497e-06, "loss": 0.3504, "num_tokens": 2242038567.0, "step": 8565 }, { "epoch": 3.9953379953379953, "grad_norm": 0.34047566030766635, "learning_rate": 9.787713416107919e-06, "loss": 0.3535, "num_tokens": 2243349287.0, "step": 8570 }, { "epoch": 3.9976689976689976, "grad_norm": 0.32231730842208467, "learning_rate": 9.76634134385788e-06, "loss": 0.3576, "num_tokens": 2244653368.0, "step": 8575 }, { "epoch": 4.0, "grad_norm": 0.3196697658342358, "learning_rate": 9.745011427680106e-06, "loss": 0.3417, "num_tokens": 2245952708.0, "step": 8580 }, { "epoch": 4.002331002331002, "grad_norm": 0.3154639847560265, "learning_rate": 9.723723718279595e-06, "loss": 0.3027, "num_tokens": 2247263428.0, "step": 8585 }, { "epoch": 4.004662004662005, "grad_norm": 0.3627171601549924, "learning_rate": 9.702478266261042e-06, "loss": 0.3105, "num_tokens": 2248564079.0, "step": 8590 }, { "epoch": 4.006993006993007, "grad_norm": 0.36469519862133226, "learning_rate": 9.68127512212868e-06, "loss": 0.3218, "num_tokens": 2249874799.0, "step": 8595 }, { "epoch": 4.0093240093240095, "grad_norm": 0.3315289199193423, "learning_rate": 9.660114336286164e-06, "loss": 0.3212, "num_tokens": 2251185519.0, "step": 8600 }, { "epoch": 4.011655011655011, "grad_norm": 0.35875417884768623, "learning_rate": 9.638995959036456e-06, "loss": 0.3109, "num_tokens": 2252491344.0, "step": 8605 }, { "epoch": 4.013986013986014, "grad_norm": 0.32867156865417013, "learning_rate": 9.617920040581724e-06, "loss": 0.303, "num_tokens": 2253802064.0, "step": 8610 }, { "epoch": 4.016317016317016, "grad_norm": 0.310349653790998, "learning_rate": 9.596886631023169e-06, "loss": 0.3094, "num_tokens": 2255108675.0, "step": 8615 }, { "epoch": 4.018648018648019, "grad_norm": 0.32806845035826643, "learning_rate": 9.575895780360969e-06, "loss": 0.3207, "num_tokens": 2256419395.0, "step": 8620 }, { "epoch": 4.020979020979021, "grad_norm": 0.3217206000624844, "learning_rate": 9.55494753849413e-06, "loss": 0.3088, "num_tokens": 2257730115.0, "step": 8625 }, { "epoch": 4.023310023310024, "grad_norm": 0.33329725907377766, "learning_rate": 9.534041955220353e-06, "loss": 0.309, "num_tokens": 2259040835.0, "step": 8630 }, { "epoch": 4.0256410256410255, "grad_norm": 0.3237761512977047, "learning_rate": 9.513179080235933e-06, "loss": 0.3108, "num_tokens": 2260351555.0, "step": 8635 }, { "epoch": 4.027972027972028, "grad_norm": 0.3301327078317335, "learning_rate": 9.492358963135671e-06, "loss": 0.3075, "num_tokens": 2261662275.0, "step": 8640 }, { "epoch": 4.03030303030303, "grad_norm": 0.3257522642348306, "learning_rate": 9.47158165341269e-06, "loss": 0.3167, "num_tokens": 2262972995.0, "step": 8645 }, { "epoch": 4.032634032634032, "grad_norm": 0.3400399129587691, "learning_rate": 9.450847200458351e-06, "loss": 0.3144, "num_tokens": 2264283715.0, "step": 8650 }, { "epoch": 4.034965034965035, "grad_norm": 0.3279323276345651, "learning_rate": 9.430155653562176e-06, "loss": 0.3138, "num_tokens": 2265594435.0, "step": 8655 }, { "epoch": 4.037296037296037, "grad_norm": 0.34371766089078787, "learning_rate": 9.409507061911648e-06, "loss": 0.3153, "num_tokens": 2266897223.0, "step": 8660 }, { "epoch": 4.03962703962704, "grad_norm": 0.3237833855664639, "learning_rate": 9.38890147459216e-06, "loss": 0.3141, "num_tokens": 2268207943.0, "step": 8665 }, { "epoch": 4.041958041958042, "grad_norm": 0.33288798285203314, "learning_rate": 9.368338940586866e-06, "loss": 0.3144, "num_tokens": 2269518663.0, "step": 8670 }, { "epoch": 4.0442890442890445, "grad_norm": 0.34404924053052394, "learning_rate": 9.347819508776593e-06, "loss": 0.3142, "num_tokens": 2270829383.0, "step": 8675 }, { "epoch": 4.046620046620046, "grad_norm": 0.3366069132240311, "learning_rate": 9.327343227939677e-06, "loss": 0.3118, "num_tokens": 2272140103.0, "step": 8680 }, { "epoch": 4.048951048951049, "grad_norm": 0.33022829494586375, "learning_rate": 9.306910146751903e-06, "loss": 0.3025, "num_tokens": 2273448784.0, "step": 8685 }, { "epoch": 4.051282051282051, "grad_norm": 0.3362236980144924, "learning_rate": 9.286520313786359e-06, "loss": 0.3062, "num_tokens": 2274759504.0, "step": 8690 }, { "epoch": 4.053613053613054, "grad_norm": 0.3375407896276986, "learning_rate": 9.2661737775133e-06, "loss": 0.3115, "num_tokens": 2276070224.0, "step": 8695 }, { "epoch": 4.055944055944056, "grad_norm": 0.3387055817635362, "learning_rate": 9.245870586300086e-06, "loss": 0.3076, "num_tokens": 2277380944.0, "step": 8700 }, { "epoch": 4.058275058275059, "grad_norm": 0.33597806666914465, "learning_rate": 9.225610788411028e-06, "loss": 0.3124, "num_tokens": 2278683128.0, "step": 8705 }, { "epoch": 4.0606060606060606, "grad_norm": 0.33166354222510536, "learning_rate": 9.205394432007274e-06, "loss": 0.3195, "num_tokens": 2279993848.0, "step": 8710 }, { "epoch": 4.062937062937063, "grad_norm": 0.340915400738789, "learning_rate": 9.185221565146719e-06, "loss": 0.3129, "num_tokens": 2281304568.0, "step": 8715 }, { "epoch": 4.065268065268065, "grad_norm": 0.324138237680919, "learning_rate": 9.165092235783872e-06, "loss": 0.3026, "num_tokens": 2282615288.0, "step": 8720 }, { "epoch": 4.067599067599067, "grad_norm": 0.32248302399372863, "learning_rate": 9.145006491769734e-06, "loss": 0.3131, "num_tokens": 2283926008.0, "step": 8725 }, { "epoch": 4.06993006993007, "grad_norm": 0.3300129820806774, "learning_rate": 9.124964380851697e-06, "loss": 0.3147, "num_tokens": 2285236728.0, "step": 8730 }, { "epoch": 4.072261072261072, "grad_norm": 0.3281535752862011, "learning_rate": 9.104965950673457e-06, "loss": 0.317, "num_tokens": 2286547448.0, "step": 8735 }, { "epoch": 4.074592074592075, "grad_norm": 0.33825429279702496, "learning_rate": 9.085011248774844e-06, "loss": 0.3056, "num_tokens": 2287858168.0, "step": 8740 }, { "epoch": 4.076923076923077, "grad_norm": 0.33028034320434174, "learning_rate": 9.065100322591735e-06, "loss": 0.3084, "num_tokens": 2289164548.0, "step": 8745 }, { "epoch": 4.0792540792540795, "grad_norm": 0.3353664679588956, "learning_rate": 9.045233219455967e-06, "loss": 0.3257, "num_tokens": 2290475268.0, "step": 8750 }, { "epoch": 4.081585081585081, "grad_norm": 0.3315814088066854, "learning_rate": 9.025409986595191e-06, "loss": 0.3131, "num_tokens": 2291785988.0, "step": 8755 }, { "epoch": 4.083916083916084, "grad_norm": 0.32829158816501314, "learning_rate": 9.005630671132767e-06, "loss": 0.3247, "num_tokens": 2293090934.0, "step": 8760 }, { "epoch": 4.086247086247086, "grad_norm": 0.3400820384105229, "learning_rate": 8.985895320087657e-06, "loss": 0.322, "num_tokens": 2294401654.0, "step": 8765 }, { "epoch": 4.088578088578089, "grad_norm": 0.3469249174397503, "learning_rate": 8.96620398037432e-06, "loss": 0.3204, "num_tokens": 2295712374.0, "step": 8770 }, { "epoch": 4.090909090909091, "grad_norm": 0.33092792284960687, "learning_rate": 8.946556698802578e-06, "loss": 0.3171, "num_tokens": 2297023094.0, "step": 8775 }, { "epoch": 4.093240093240094, "grad_norm": 0.34641277690350863, "learning_rate": 8.926953522077528e-06, "loss": 0.314, "num_tokens": 2298333814.0, "step": 8780 }, { "epoch": 4.0955710955710956, "grad_norm": 0.32438963991128505, "learning_rate": 8.907394496799429e-06, "loss": 0.3143, "num_tokens": 2299637704.0, "step": 8785 }, { "epoch": 4.0979020979020975, "grad_norm": 0.33556755134102456, "learning_rate": 8.887879669463562e-06, "loss": 0.3144, "num_tokens": 2300946471.0, "step": 8790 }, { "epoch": 4.1002331002331, "grad_norm": 0.3370515747236785, "learning_rate": 8.868409086460167e-06, "loss": 0.3138, "num_tokens": 2302257191.0, "step": 8795 }, { "epoch": 4.102564102564102, "grad_norm": 0.32539271406101317, "learning_rate": 8.848982794074288e-06, "loss": 0.3013, "num_tokens": 2303567911.0, "step": 8800 }, { "epoch": 4.104895104895105, "grad_norm": 0.3348268658945831, "learning_rate": 8.829600838485691e-06, "loss": 0.3191, "num_tokens": 2304878631.0, "step": 8805 }, { "epoch": 4.107226107226107, "grad_norm": 0.3254305192351051, "learning_rate": 8.810263265768749e-06, "loss": 0.3097, "num_tokens": 2306189351.0, "step": 8810 }, { "epoch": 4.10955710955711, "grad_norm": 0.3528490799980861, "learning_rate": 8.790970121892318e-06, "loss": 0.3144, "num_tokens": 2307500071.0, "step": 8815 }, { "epoch": 4.111888111888112, "grad_norm": 0.34560029308995477, "learning_rate": 8.771721452719644e-06, "loss": 0.329, "num_tokens": 2308800878.0, "step": 8820 }, { "epoch": 4.1142191142191145, "grad_norm": 0.34271459470922583, "learning_rate": 8.752517304008263e-06, "loss": 0.3179, "num_tokens": 2310111598.0, "step": 8825 }, { "epoch": 4.116550116550116, "grad_norm": 0.34444328829999327, "learning_rate": 8.733357721409847e-06, "loss": 0.3074, "num_tokens": 2311422318.0, "step": 8830 }, { "epoch": 4.118881118881119, "grad_norm": 0.3345099910245634, "learning_rate": 8.714242750470155e-06, "loss": 0.3169, "num_tokens": 2312733038.0, "step": 8835 }, { "epoch": 4.121212121212121, "grad_norm": 0.336105428464414, "learning_rate": 8.695172436628885e-06, "loss": 0.3074, "num_tokens": 2314036848.0, "step": 8840 }, { "epoch": 4.123543123543124, "grad_norm": 0.3432184731029786, "learning_rate": 8.676146825219574e-06, "loss": 0.3244, "num_tokens": 2315347568.0, "step": 8845 }, { "epoch": 4.125874125874126, "grad_norm": 0.3514286741500932, "learning_rate": 8.657165961469496e-06, "loss": 0.3122, "num_tokens": 2316647401.0, "step": 8850 }, { "epoch": 4.128205128205128, "grad_norm": 0.3464258867521915, "learning_rate": 8.63822989049955e-06, "loss": 0.3121, "num_tokens": 2317958121.0, "step": 8855 }, { "epoch": 4.130536130536131, "grad_norm": 0.33188673347599557, "learning_rate": 8.619338657324167e-06, "loss": 0.308, "num_tokens": 2319268841.0, "step": 8860 }, { "epoch": 4.1328671328671325, "grad_norm": 0.33368696164039957, "learning_rate": 8.600492306851166e-06, "loss": 0.3115, "num_tokens": 2320579561.0, "step": 8865 }, { "epoch": 4.135198135198135, "grad_norm": 0.3233544453202552, "learning_rate": 8.581690883881696e-06, "loss": 0.3185, "num_tokens": 2321890281.0, "step": 8870 }, { "epoch": 4.137529137529137, "grad_norm": 0.33173085265251867, "learning_rate": 8.562934433110101e-06, "loss": 0.3081, "num_tokens": 2323194296.0, "step": 8875 }, { "epoch": 4.13986013986014, "grad_norm": 0.32262540271377654, "learning_rate": 8.544222999123798e-06, "loss": 0.3099, "num_tokens": 2324493603.0, "step": 8880 }, { "epoch": 4.142191142191142, "grad_norm": 0.3267911687013554, "learning_rate": 8.525556626403214e-06, "loss": 0.3149, "num_tokens": 2325804323.0, "step": 8885 }, { "epoch": 4.144522144522145, "grad_norm": 0.3668716182370164, "learning_rate": 8.506935359321655e-06, "loss": 0.317, "num_tokens": 2327101698.0, "step": 8890 }, { "epoch": 4.146853146853147, "grad_norm": 0.3223134667321982, "learning_rate": 8.488359242145182e-06, "loss": 0.3086, "num_tokens": 2328412418.0, "step": 8895 }, { "epoch": 4.1491841491841495, "grad_norm": 0.31791906605050724, "learning_rate": 8.469828319032555e-06, "loss": 0.3112, "num_tokens": 2329705306.0, "step": 8900 }, { "epoch": 4.151515151515151, "grad_norm": 0.3238691360164819, "learning_rate": 8.451342634035081e-06, "loss": 0.312, "num_tokens": 2331002662.0, "step": 8905 }, { "epoch": 4.153846153846154, "grad_norm": 0.32796231037465196, "learning_rate": 8.432902231096532e-06, "loss": 0.318, "num_tokens": 2332313382.0, "step": 8910 }, { "epoch": 4.156177156177156, "grad_norm": 0.33796829228728653, "learning_rate": 8.414507154053038e-06, "loss": 0.309, "num_tokens": 2333624102.0, "step": 8915 }, { "epoch": 4.158508158508159, "grad_norm": 0.32836861040158594, "learning_rate": 8.396157446632985e-06, "loss": 0.3019, "num_tokens": 2334934822.0, "step": 8920 }, { "epoch": 4.160839160839161, "grad_norm": 0.32684570489397824, "learning_rate": 8.3778531524569e-06, "loss": 0.312, "num_tokens": 2336245542.0, "step": 8925 }, { "epoch": 4.163170163170163, "grad_norm": 0.3318787549687187, "learning_rate": 8.359594315037348e-06, "loss": 0.3202, "num_tokens": 2337556262.0, "step": 8930 }, { "epoch": 4.165501165501166, "grad_norm": 0.33111122882180744, "learning_rate": 8.341380977778866e-06, "loss": 0.3155, "num_tokens": 2338853654.0, "step": 8935 }, { "epoch": 4.1678321678321675, "grad_norm": 0.3224813447044692, "learning_rate": 8.323213183977793e-06, "loss": 0.3091, "num_tokens": 2340164374.0, "step": 8940 }, { "epoch": 4.17016317016317, "grad_norm": 0.3456964708575295, "learning_rate": 8.305090976822214e-06, "loss": 0.31, "num_tokens": 2341462721.0, "step": 8945 }, { "epoch": 4.172494172494172, "grad_norm": 0.33107924062669025, "learning_rate": 8.287014399391866e-06, "loss": 0.3207, "num_tokens": 2342762061.0, "step": 8950 }, { "epoch": 4.174825174825175, "grad_norm": 0.33431120526499053, "learning_rate": 8.268983494657993e-06, "loss": 0.3179, "num_tokens": 2344072781.0, "step": 8955 }, { "epoch": 4.177156177156177, "grad_norm": 0.32952875537848103, "learning_rate": 8.250998305483268e-06, "loss": 0.306, "num_tokens": 2345376435.0, "step": 8960 }, { "epoch": 4.17948717948718, "grad_norm": 0.3456893653270103, "learning_rate": 8.233058874621704e-06, "loss": 0.326, "num_tokens": 2346687155.0, "step": 8965 }, { "epoch": 4.181818181818182, "grad_norm": 0.3394277090932486, "learning_rate": 8.215165244718532e-06, "loss": 0.3144, "num_tokens": 2347997875.0, "step": 8970 }, { "epoch": 4.1841491841491845, "grad_norm": 0.33738822320988965, "learning_rate": 8.197317458310092e-06, "loss": 0.3092, "num_tokens": 2349296210.0, "step": 8975 }, { "epoch": 4.186480186480186, "grad_norm": 0.34369988163488063, "learning_rate": 8.179515557823769e-06, "loss": 0.3109, "num_tokens": 2350606930.0, "step": 8980 }, { "epoch": 4.188811188811189, "grad_norm": 0.33185490990267685, "learning_rate": 8.161759585577863e-06, "loss": 0.3222, "num_tokens": 2351905894.0, "step": 8985 }, { "epoch": 4.191142191142191, "grad_norm": 0.33437248830096067, "learning_rate": 8.144049583781475e-06, "loss": 0.313, "num_tokens": 2353216614.0, "step": 8990 }, { "epoch": 4.193473193473194, "grad_norm": 0.32606670299996504, "learning_rate": 8.126385594534448e-06, "loss": 0.3155, "num_tokens": 2354527334.0, "step": 8995 }, { "epoch": 4.195804195804196, "grad_norm": 0.354696304621865, "learning_rate": 8.108767659827245e-06, "loss": 0.3019, "num_tokens": 2355838054.0, "step": 9000 }, { "epoch": 4.198135198135198, "grad_norm": 0.30886261035523754, "learning_rate": 8.09119582154083e-06, "loss": 0.3125, "num_tokens": 2357148774.0, "step": 9005 }, { "epoch": 4.200466200466201, "grad_norm": 0.32186205072669777, "learning_rate": 8.07367012144661e-06, "loss": 0.3102, "num_tokens": 2358459494.0, "step": 9010 }, { "epoch": 4.2027972027972025, "grad_norm": 0.32692083529916577, "learning_rate": 8.05619060120629e-06, "loss": 0.3097, "num_tokens": 2359770023.0, "step": 9015 }, { "epoch": 4.205128205128205, "grad_norm": 0.3536074622682776, "learning_rate": 8.038757302371816e-06, "loss": 0.3124, "num_tokens": 2361080743.0, "step": 9020 }, { "epoch": 4.207459207459207, "grad_norm": 0.3320417618080785, "learning_rate": 8.021370266385257e-06, "loss": 0.3143, "num_tokens": 2362384296.0, "step": 9025 }, { "epoch": 4.20979020979021, "grad_norm": 0.33236316161184004, "learning_rate": 8.004029534578694e-06, "loss": 0.3202, "num_tokens": 2363695016.0, "step": 9030 }, { "epoch": 4.212121212121212, "grad_norm": 0.3282580657954475, "learning_rate": 7.986735148174142e-06, "loss": 0.3102, "num_tokens": 2364989382.0, "step": 9035 }, { "epoch": 4.214452214452215, "grad_norm": 0.3449624050121814, "learning_rate": 7.969487148283451e-06, "loss": 0.3222, "num_tokens": 2366300102.0, "step": 9040 }, { "epoch": 4.216783216783217, "grad_norm": 0.33015411004128303, "learning_rate": 7.95228557590819e-06, "loss": 0.3189, "num_tokens": 2367610822.0, "step": 9045 }, { "epoch": 4.2191142191142195, "grad_norm": 0.3482626079276277, "learning_rate": 7.935130471939572e-06, "loss": 0.319, "num_tokens": 2368919557.0, "step": 9050 }, { "epoch": 4.221445221445221, "grad_norm": 0.3313155428462869, "learning_rate": 7.918021877158333e-06, "loss": 0.3229, "num_tokens": 2370230277.0, "step": 9055 }, { "epoch": 4.223776223776224, "grad_norm": 0.3286320237256984, "learning_rate": 7.900959832234667e-06, "loss": 0.315, "num_tokens": 2371540997.0, "step": 9060 }, { "epoch": 4.226107226107226, "grad_norm": 0.32781843149335, "learning_rate": 7.883944377728091e-06, "loss": 0.3168, "num_tokens": 2372851717.0, "step": 9065 }, { "epoch": 4.228438228438228, "grad_norm": 0.3199378658171041, "learning_rate": 7.866975554087384e-06, "loss": 0.3196, "num_tokens": 2374154156.0, "step": 9070 }, { "epoch": 4.230769230769231, "grad_norm": 0.32318125321218416, "learning_rate": 7.85005340165047e-06, "loss": 0.3109, "num_tokens": 2375464876.0, "step": 9075 }, { "epoch": 4.233100233100233, "grad_norm": 0.32948239675038016, "learning_rate": 7.833177960644318e-06, "loss": 0.3149, "num_tokens": 2376775596.0, "step": 9080 }, { "epoch": 4.235431235431236, "grad_norm": 0.330342035385706, "learning_rate": 7.816349271184873e-06, "loss": 0.3228, "num_tokens": 2378072943.0, "step": 9085 }, { "epoch": 4.2377622377622375, "grad_norm": 0.3343981894720827, "learning_rate": 7.79956737327693e-06, "loss": 0.3272, "num_tokens": 2379383663.0, "step": 9090 }, { "epoch": 4.24009324009324, "grad_norm": 0.3370576768884812, "learning_rate": 7.782832306814055e-06, "loss": 0.3215, "num_tokens": 2380694383.0, "step": 9095 }, { "epoch": 4.242424242424242, "grad_norm": 0.34055936569913503, "learning_rate": 7.766144111578488e-06, "loss": 0.3154, "num_tokens": 2382005103.0, "step": 9100 }, { "epoch": 4.244755244755245, "grad_norm": 0.3250618537313876, "learning_rate": 7.749502827241053e-06, "loss": 0.3054, "num_tokens": 2383315823.0, "step": 9105 }, { "epoch": 4.247086247086247, "grad_norm": 0.33979822723341285, "learning_rate": 7.732908493361054e-06, "loss": 0.3131, "num_tokens": 2384626543.0, "step": 9110 }, { "epoch": 4.24941724941725, "grad_norm": 0.3448739247944029, "learning_rate": 7.716361149386169e-06, "loss": 0.3154, "num_tokens": 2385937263.0, "step": 9115 }, { "epoch": 4.251748251748252, "grad_norm": 0.3513942678682862, "learning_rate": 7.69986083465241e-06, "loss": 0.3161, "num_tokens": 2387247983.0, "step": 9120 }, { "epoch": 4.2540792540792545, "grad_norm": 0.33283021075110353, "learning_rate": 7.68340758838396e-06, "loss": 0.3138, "num_tokens": 2388558703.0, "step": 9125 }, { "epoch": 4.256410256410256, "grad_norm": 0.3241327933794044, "learning_rate": 7.667001449693118e-06, "loss": 0.3073, "num_tokens": 2389869423.0, "step": 9130 }, { "epoch": 4.258741258741258, "grad_norm": 0.33663247742640745, "learning_rate": 7.650642457580216e-06, "loss": 0.3245, "num_tokens": 2391180143.0, "step": 9135 }, { "epoch": 4.261072261072261, "grad_norm": 0.35167952703734645, "learning_rate": 7.634330650933491e-06, "loss": 0.318, "num_tokens": 2392490863.0, "step": 9140 }, { "epoch": 4.263403263403263, "grad_norm": 0.3458772358952002, "learning_rate": 7.618066068529013e-06, "loss": 0.3217, "num_tokens": 2393801583.0, "step": 9145 }, { "epoch": 4.265734265734266, "grad_norm": 0.33672532361513857, "learning_rate": 7.601848749030614e-06, "loss": 0.3153, "num_tokens": 2395112303.0, "step": 9150 }, { "epoch": 4.268065268065268, "grad_norm": 0.3412841871135003, "learning_rate": 7.5856787309897485e-06, "loss": 0.3129, "num_tokens": 2396423023.0, "step": 9155 }, { "epoch": 4.270396270396271, "grad_norm": 0.34739331638218984, "learning_rate": 7.5695560528454335e-06, "loss": 0.3268, "num_tokens": 2397733743.0, "step": 9160 }, { "epoch": 4.2727272727272725, "grad_norm": 0.32838660057211366, "learning_rate": 7.553480752924152e-06, "loss": 0.3176, "num_tokens": 2399044463.0, "step": 9165 }, { "epoch": 4.275058275058275, "grad_norm": 0.34100398627810036, "learning_rate": 7.537452869439773e-06, "loss": 0.3238, "num_tokens": 2400355183.0, "step": 9170 }, { "epoch": 4.277389277389277, "grad_norm": 0.34349089530086746, "learning_rate": 7.521472440493424e-06, "loss": 0.3241, "num_tokens": 2401665903.0, "step": 9175 }, { "epoch": 4.27972027972028, "grad_norm": 0.32237561114424745, "learning_rate": 7.5055395040734375e-06, "loss": 0.31, "num_tokens": 2402964469.0, "step": 9180 }, { "epoch": 4.282051282051282, "grad_norm": 0.34307510366114763, "learning_rate": 7.489654098055261e-06, "loss": 0.3307, "num_tokens": 2404275189.0, "step": 9185 }, { "epoch": 4.284382284382285, "grad_norm": 0.3541210255290945, "learning_rate": 7.473816260201326e-06, "loss": 0.3213, "num_tokens": 2405585909.0, "step": 9190 }, { "epoch": 4.286713286713287, "grad_norm": 0.33830233519487823, "learning_rate": 7.458026028161005e-06, "loss": 0.3171, "num_tokens": 2406871865.0, "step": 9195 }, { "epoch": 4.2890442890442895, "grad_norm": 0.3280850693643302, "learning_rate": 7.442283439470503e-06, "loss": 0.3234, "num_tokens": 2408182585.0, "step": 9200 }, { "epoch": 4.291375291375291, "grad_norm": 0.34250336046966473, "learning_rate": 7.426588531552755e-06, "loss": 0.3188, "num_tokens": 2409488739.0, "step": 9205 }, { "epoch": 4.293706293706293, "grad_norm": 0.33353326558959956, "learning_rate": 7.4109413417173645e-06, "loss": 0.3162, "num_tokens": 2410799459.0, "step": 9210 }, { "epoch": 4.296037296037296, "grad_norm": 0.32332388680356194, "learning_rate": 7.3953419071604965e-06, "loss": 0.3229, "num_tokens": 2412110179.0, "step": 9215 }, { "epoch": 4.298368298368298, "grad_norm": 0.3240810807807097, "learning_rate": 7.379790264964787e-06, "loss": 0.3071, "num_tokens": 2413420899.0, "step": 9220 }, { "epoch": 4.300699300699301, "grad_norm": 0.33571497946632756, "learning_rate": 7.364286452099268e-06, "loss": 0.3247, "num_tokens": 2414731619.0, "step": 9225 }, { "epoch": 4.303030303030303, "grad_norm": 0.3156999850446858, "learning_rate": 7.348830505419266e-06, "loss": 0.3078, "num_tokens": 2416042339.0, "step": 9230 }, { "epoch": 4.305361305361306, "grad_norm": 0.31560747105000503, "learning_rate": 7.333422461666334e-06, "loss": 0.3079, "num_tokens": 2417352288.0, "step": 9235 }, { "epoch": 4.3076923076923075, "grad_norm": 0.31764759996906916, "learning_rate": 7.318062357468133e-06, "loss": 0.3109, "num_tokens": 2418663008.0, "step": 9240 }, { "epoch": 4.31002331002331, "grad_norm": 0.33954512734132447, "learning_rate": 7.302750229338377e-06, "loss": 0.3141, "num_tokens": 2419973728.0, "step": 9245 }, { "epoch": 4.312354312354312, "grad_norm": 0.33905444936733614, "learning_rate": 7.287486113676732e-06, "loss": 0.3055, "num_tokens": 2421284448.0, "step": 9250 }, { "epoch": 4.314685314685315, "grad_norm": 0.3516319488679377, "learning_rate": 7.272270046768719e-06, "loss": 0.3229, "num_tokens": 2422595168.0, "step": 9255 }, { "epoch": 4.317016317016317, "grad_norm": 0.31543886841356666, "learning_rate": 7.257102064785647e-06, "loss": 0.3026, "num_tokens": 2423905888.0, "step": 9260 }, { "epoch": 4.31934731934732, "grad_norm": 0.33654187527085266, "learning_rate": 7.241982203784521e-06, "loss": 0.316, "num_tokens": 2425216608.0, "step": 9265 }, { "epoch": 4.321678321678322, "grad_norm": 0.3453335987898658, "learning_rate": 7.226910499707942e-06, "loss": 0.3213, "num_tokens": 2426527328.0, "step": 9270 }, { "epoch": 4.3240093240093245, "grad_norm": 0.31972916985922056, "learning_rate": 7.211886988384051e-06, "loss": 0.3141, "num_tokens": 2427838048.0, "step": 9275 }, { "epoch": 4.326340326340326, "grad_norm": 0.31446079269320515, "learning_rate": 7.196911705526405e-06, "loss": 0.3161, "num_tokens": 2429148768.0, "step": 9280 }, { "epoch": 4.328671328671328, "grad_norm": 0.31884066154984625, "learning_rate": 7.181984686733929e-06, "loss": 0.3059, "num_tokens": 2430459488.0, "step": 9285 }, { "epoch": 4.331002331002331, "grad_norm": 0.338313943294416, "learning_rate": 7.167105967490818e-06, "loss": 0.3104, "num_tokens": 2431770208.0, "step": 9290 }, { "epoch": 4.333333333333333, "grad_norm": 0.3434119449215798, "learning_rate": 7.1522755831664345e-06, "loss": 0.3128, "num_tokens": 2433080928.0, "step": 9295 }, { "epoch": 4.335664335664336, "grad_norm": 0.31426173762807547, "learning_rate": 7.137493569015252e-06, "loss": 0.3136, "num_tokens": 2434388313.0, "step": 9300 }, { "epoch": 4.337995337995338, "grad_norm": 0.33860929397786954, "learning_rate": 7.122759960176764e-06, "loss": 0.316, "num_tokens": 2435699033.0, "step": 9305 }, { "epoch": 4.340326340326341, "grad_norm": 0.32512758447513196, "learning_rate": 7.108074791675377e-06, "loss": 0.3276, "num_tokens": 2437009753.0, "step": 9310 }, { "epoch": 4.3426573426573425, "grad_norm": 0.3408691365223931, "learning_rate": 7.093438098420364e-06, "loss": 0.3111, "num_tokens": 2438320473.0, "step": 9315 }, { "epoch": 4.344988344988345, "grad_norm": 10.7223619115978, "learning_rate": 7.078849915205761e-06, "loss": 0.3984, "num_tokens": 2439624916.0, "step": 9320 }, { "epoch": 4.347319347319347, "grad_norm": 0.3465031175218188, "learning_rate": 7.06431027671028e-06, "loss": 0.3282, "num_tokens": 2440935636.0, "step": 9325 }, { "epoch": 4.34965034965035, "grad_norm": 0.34298399330314866, "learning_rate": 7.049819217497229e-06, "loss": 0.3151, "num_tokens": 2442246356.0, "step": 9330 }, { "epoch": 4.351981351981352, "grad_norm": 0.32774175918296494, "learning_rate": 7.0353767720144585e-06, "loss": 0.311, "num_tokens": 2443557076.0, "step": 9335 }, { "epoch": 4.354312354312354, "grad_norm": 0.3445401287963301, "learning_rate": 7.020982974594234e-06, "loss": 0.3177, "num_tokens": 2444867796.0, "step": 9340 }, { "epoch": 4.356643356643357, "grad_norm": 0.33840413843763606, "learning_rate": 7.006637859453166e-06, "loss": 0.3175, "num_tokens": 2446166667.0, "step": 9345 }, { "epoch": 4.358974358974359, "grad_norm": 0.3442185853976276, "learning_rate": 6.99234146069218e-06, "loss": 0.3285, "num_tokens": 2447477387.0, "step": 9350 }, { "epoch": 4.361305361305361, "grad_norm": 0.32304618994576395, "learning_rate": 6.978093812296353e-06, "loss": 0.3241, "num_tokens": 2448788107.0, "step": 9355 }, { "epoch": 4.363636363636363, "grad_norm": 0.3371003210685687, "learning_rate": 6.963894948134886e-06, "loss": 0.3153, "num_tokens": 2450098827.0, "step": 9360 }, { "epoch": 4.365967365967366, "grad_norm": 0.32770902047531997, "learning_rate": 6.949744901961018e-06, "loss": 0.3205, "num_tokens": 2451409547.0, "step": 9365 }, { "epoch": 4.368298368298368, "grad_norm": 0.3316976289164916, "learning_rate": 6.935643707411941e-06, "loss": 0.3181, "num_tokens": 2452715351.0, "step": 9370 }, { "epoch": 4.370629370629371, "grad_norm": 0.3297586670212026, "learning_rate": 6.9215913980087e-06, "loss": 0.3127, "num_tokens": 2454026071.0, "step": 9375 }, { "epoch": 4.372960372960373, "grad_norm": 0.34164744331202934, "learning_rate": 6.907588007156147e-06, "loss": 0.3167, "num_tokens": 2455323641.0, "step": 9380 }, { "epoch": 4.375291375291376, "grad_norm": 0.3325814425613154, "learning_rate": 6.893633568142849e-06, "loss": 0.3115, "num_tokens": 2456634361.0, "step": 9385 }, { "epoch": 4.3776223776223775, "grad_norm": 0.358839160865776, "learning_rate": 6.87972811414099e-06, "loss": 0.3007, "num_tokens": 2457945081.0, "step": 9390 }, { "epoch": 4.37995337995338, "grad_norm": 0.34252531902628414, "learning_rate": 6.865871678206317e-06, "loss": 0.3189, "num_tokens": 2459255801.0, "step": 9395 }, { "epoch": 4.382284382284382, "grad_norm": 0.3241607534033865, "learning_rate": 6.85206429327806e-06, "loss": 0.3063, "num_tokens": 2460566521.0, "step": 9400 }, { "epoch": 4.384615384615385, "grad_norm": 0.32915418617519726, "learning_rate": 6.838305992178824e-06, "loss": 0.3181, "num_tokens": 2461877241.0, "step": 9405 }, { "epoch": 4.386946386946387, "grad_norm": 0.3252694583787816, "learning_rate": 6.824596807614559e-06, "loss": 0.3115, "num_tokens": 2463187961.0, "step": 9410 }, { "epoch": 4.389277389277389, "grad_norm": 0.3293083654300197, "learning_rate": 6.810936772174439e-06, "loss": 0.3235, "num_tokens": 2464495546.0, "step": 9415 }, { "epoch": 4.391608391608392, "grad_norm": 0.3542236093862788, "learning_rate": 6.797325918330806e-06, "loss": 0.3032, "num_tokens": 2465796046.0, "step": 9420 }, { "epoch": 4.393939393939394, "grad_norm": 0.3330424585038265, "learning_rate": 6.783764278439092e-06, "loss": 0.3112, "num_tokens": 2467106766.0, "step": 9425 }, { "epoch": 4.396270396270396, "grad_norm": 0.34014850867321345, "learning_rate": 6.77025188473773e-06, "loss": 0.3108, "num_tokens": 2468417486.0, "step": 9430 }, { "epoch": 4.398601398601398, "grad_norm": 0.3325831324351841, "learning_rate": 6.756788769348103e-06, "loss": 0.3189, "num_tokens": 2469728206.0, "step": 9435 }, { "epoch": 4.400932400932401, "grad_norm": 0.33398045142731836, "learning_rate": 6.743374964274427e-06, "loss": 0.3212, "num_tokens": 2471038926.0, "step": 9440 }, { "epoch": 4.403263403263403, "grad_norm": 0.3149622560196894, "learning_rate": 6.730010501403718e-06, "loss": 0.3103, "num_tokens": 2472349646.0, "step": 9445 }, { "epoch": 4.405594405594406, "grad_norm": 0.3296632544724334, "learning_rate": 6.716695412505688e-06, "loss": 0.3141, "num_tokens": 2473660366.0, "step": 9450 }, { "epoch": 4.407925407925408, "grad_norm": 0.3266017858365408, "learning_rate": 6.703429729232682e-06, "loss": 0.3195, "num_tokens": 2474971086.0, "step": 9455 }, { "epoch": 4.410256410256411, "grad_norm": 0.3373805559516959, "learning_rate": 6.690213483119595e-06, "loss": 0.312, "num_tokens": 2476281806.0, "step": 9460 }, { "epoch": 4.4125874125874125, "grad_norm": 0.3366721023486427, "learning_rate": 6.677046705583806e-06, "loss": 0.3171, "num_tokens": 2477592526.0, "step": 9465 }, { "epoch": 4.414918414918415, "grad_norm": 0.3310996468709432, "learning_rate": 6.663929427925095e-06, "loss": 0.3054, "num_tokens": 2478903246.0, "step": 9470 }, { "epoch": 4.417249417249417, "grad_norm": 0.3237625603637502, "learning_rate": 6.650861681325567e-06, "loss": 0.3063, "num_tokens": 2480204978.0, "step": 9475 }, { "epoch": 4.41958041958042, "grad_norm": 0.35443174214884327, "learning_rate": 6.6378434968495965e-06, "loss": 0.3186, "num_tokens": 2481515698.0, "step": 9480 }, { "epoch": 4.421911421911422, "grad_norm": 0.34643582807007817, "learning_rate": 6.624874905443726e-06, "loss": 0.3104, "num_tokens": 2482810080.0, "step": 9485 }, { "epoch": 4.424242424242424, "grad_norm": 0.3353431088763468, "learning_rate": 6.611955937936619e-06, "loss": 0.3042, "num_tokens": 2484095674.0, "step": 9490 }, { "epoch": 4.426573426573427, "grad_norm": 0.3242056478567321, "learning_rate": 6.599086625038957e-06, "loss": 0.32, "num_tokens": 2485406394.0, "step": 9495 }, { "epoch": 4.428904428904429, "grad_norm": 0.3331241038387844, "learning_rate": 6.586266997343402e-06, "loss": 0.3078, "num_tokens": 2486697670.0, "step": 9500 }, { "epoch": 4.431235431235431, "grad_norm": 0.3288847500668807, "learning_rate": 6.5734970853244985e-06, "loss": 0.3095, "num_tokens": 2488008390.0, "step": 9505 }, { "epoch": 4.433566433566433, "grad_norm": 0.3444690042268666, "learning_rate": 6.560776919338599e-06, "loss": 0.3171, "num_tokens": 2489319110.0, "step": 9510 }, { "epoch": 4.435897435897436, "grad_norm": 0.3401045593755526, "learning_rate": 6.5481065296238155e-06, "loss": 0.3233, "num_tokens": 2490621224.0, "step": 9515 }, { "epoch": 4.438228438228438, "grad_norm": 0.32485207168584423, "learning_rate": 6.535485946299927e-06, "loss": 0.3, "num_tokens": 2491931944.0, "step": 9520 }, { "epoch": 4.440559440559441, "grad_norm": 0.3362671229182379, "learning_rate": 6.5229151993683065e-06, "loss": 0.3231, "num_tokens": 2493242664.0, "step": 9525 }, { "epoch": 4.442890442890443, "grad_norm": 0.3188938856125156, "learning_rate": 6.5103943187118654e-06, "loss": 0.3248, "num_tokens": 2494553384.0, "step": 9530 }, { "epoch": 4.445221445221446, "grad_norm": 0.3206644304295667, "learning_rate": 6.49792333409498e-06, "loss": 0.3193, "num_tokens": 2495864104.0, "step": 9535 }, { "epoch": 4.4475524475524475, "grad_norm": 0.32725720854706297, "learning_rate": 6.485502275163401e-06, "loss": 0.3128, "num_tokens": 2497174824.0, "step": 9540 }, { "epoch": 4.449883449883449, "grad_norm": 0.3377685213956361, "learning_rate": 6.473131171444192e-06, "loss": 0.3098, "num_tokens": 2498485544.0, "step": 9545 }, { "epoch": 4.452214452214452, "grad_norm": 0.3267131751428466, "learning_rate": 6.460810052345697e-06, "loss": 0.3122, "num_tokens": 2499796264.0, "step": 9550 }, { "epoch": 4.454545454545454, "grad_norm": 0.3369467611573572, "learning_rate": 6.4485389471574025e-06, "loss": 0.3121, "num_tokens": 2501097144.0, "step": 9555 }, { "epoch": 4.456876456876457, "grad_norm": 0.31471698424983463, "learning_rate": 6.4363178850499115e-06, "loss": 0.3114, "num_tokens": 2502407864.0, "step": 9560 }, { "epoch": 4.459207459207459, "grad_norm": 0.3182422570183859, "learning_rate": 6.424146895074878e-06, "loss": 0.3217, "num_tokens": 2503718584.0, "step": 9565 }, { "epoch": 4.461538461538462, "grad_norm": 0.32824460237041364, "learning_rate": 6.41202600616492e-06, "loss": 0.312, "num_tokens": 2505029304.0, "step": 9570 }, { "epoch": 4.463869463869464, "grad_norm": 0.3276454341355643, "learning_rate": 6.399955247133547e-06, "loss": 0.3233, "num_tokens": 2506340024.0, "step": 9575 }, { "epoch": 4.466200466200466, "grad_norm": 0.35551047742829733, "learning_rate": 6.387934646675109e-06, "loss": 0.3172, "num_tokens": 2507650744.0, "step": 9580 }, { "epoch": 4.468531468531468, "grad_norm": 0.33605570429689574, "learning_rate": 6.375964233364725e-06, "loss": 0.3353, "num_tokens": 2508961464.0, "step": 9585 }, { "epoch": 4.470862470862471, "grad_norm": 0.3172835951855472, "learning_rate": 6.364044035658198e-06, "loss": 0.3063, "num_tokens": 2510272184.0, "step": 9590 }, { "epoch": 4.473193473193473, "grad_norm": 0.3207971543619864, "learning_rate": 6.352174081891969e-06, "loss": 0.3132, "num_tokens": 2511582904.0, "step": 9595 }, { "epoch": 4.475524475524476, "grad_norm": 0.31751145509697243, "learning_rate": 6.340354400283039e-06, "loss": 0.3107, "num_tokens": 2512893624.0, "step": 9600 }, { "epoch": 4.477855477855478, "grad_norm": 0.3384772695617782, "learning_rate": 6.328585018928896e-06, "loss": 0.3239, "num_tokens": 2514204344.0, "step": 9605 }, { "epoch": 4.480186480186481, "grad_norm": 0.34136380265968547, "learning_rate": 6.31686596580746e-06, "loss": 0.3159, "num_tokens": 2515515064.0, "step": 9610 }, { "epoch": 4.4825174825174825, "grad_norm": 0.34265867835608826, "learning_rate": 6.305197268777023e-06, "loss": 0.3232, "num_tokens": 2516804613.0, "step": 9615 }, { "epoch": 4.484848484848484, "grad_norm": 0.3471232349713498, "learning_rate": 6.293578955576149e-06, "loss": 0.3162, "num_tokens": 2518115333.0, "step": 9620 }, { "epoch": 4.487179487179487, "grad_norm": 0.33307127442521534, "learning_rate": 6.28201105382364e-06, "loss": 0.3196, "num_tokens": 2519426053.0, "step": 9625 }, { "epoch": 4.489510489510489, "grad_norm": 0.33359074202000116, "learning_rate": 6.2704935910184785e-06, "loss": 0.3136, "num_tokens": 2520736773.0, "step": 9630 }, { "epoch": 4.491841491841492, "grad_norm": 0.3240069604325885, "learning_rate": 6.259026594539719e-06, "loss": 0.3188, "num_tokens": 2522047493.0, "step": 9635 }, { "epoch": 4.494172494172494, "grad_norm": 0.3388605125051464, "learning_rate": 6.2476100916464585e-06, "loss": 0.3154, "num_tokens": 2523358213.0, "step": 9640 }, { "epoch": 4.496503496503497, "grad_norm": 0.32541913779560644, "learning_rate": 6.236244109477764e-06, "loss": 0.3197, "num_tokens": 2524663045.0, "step": 9645 }, { "epoch": 4.498834498834499, "grad_norm": 0.33945252106405477, "learning_rate": 6.224928675052609e-06, "loss": 0.3211, "num_tokens": 2525973765.0, "step": 9650 }, { "epoch": 4.501165501165501, "grad_norm": 0.3476363419781912, "learning_rate": 6.213663815269794e-06, "loss": 0.3079, "num_tokens": 2527279335.0, "step": 9655 }, { "epoch": 4.503496503496503, "grad_norm": 0.33583724777887775, "learning_rate": 6.202449556907903e-06, "loss": 0.325, "num_tokens": 2528590055.0, "step": 9660 }, { "epoch": 4.505827505827506, "grad_norm": 0.3216459701800872, "learning_rate": 6.191285926625236e-06, "loss": 0.3106, "num_tokens": 2529900775.0, "step": 9665 }, { "epoch": 4.508158508158508, "grad_norm": 0.3346538262362633, "learning_rate": 6.180172950959726e-06, "loss": 0.3161, "num_tokens": 2531211495.0, "step": 9670 }, { "epoch": 4.510489510489511, "grad_norm": 0.33904717924175304, "learning_rate": 6.169110656328905e-06, "loss": 0.3256, "num_tokens": 2532522215.0, "step": 9675 }, { "epoch": 4.512820512820513, "grad_norm": 0.31774716560986643, "learning_rate": 6.158099069029825e-06, "loss": 0.3101, "num_tokens": 2533832935.0, "step": 9680 }, { "epoch": 4.515151515151516, "grad_norm": 0.3219903613779173, "learning_rate": 6.147138215238987e-06, "loss": 0.3175, "num_tokens": 2535143655.0, "step": 9685 }, { "epoch": 4.5174825174825175, "grad_norm": 0.3270086455520368, "learning_rate": 6.136228121012301e-06, "loss": 0.3025, "num_tokens": 2536454375.0, "step": 9690 }, { "epoch": 4.519813519813519, "grad_norm": 0.32790806083662694, "learning_rate": 6.125368812285014e-06, "loss": 0.324, "num_tokens": 2537765095.0, "step": 9695 }, { "epoch": 4.522144522144522, "grad_norm": 0.3374631769629436, "learning_rate": 6.11456031487163e-06, "loss": 0.3113, "num_tokens": 2539075815.0, "step": 9700 }, { "epoch": 4.524475524475524, "grad_norm": 0.3165604361693966, "learning_rate": 6.103802654465887e-06, "loss": 0.3189, "num_tokens": 2540386535.0, "step": 9705 }, { "epoch": 4.526806526806527, "grad_norm": 0.33329264324814467, "learning_rate": 6.093095856640659e-06, "loss": 0.3267, "num_tokens": 2541697255.0, "step": 9710 }, { "epoch": 4.529137529137529, "grad_norm": 0.318747910398136, "learning_rate": 6.082439946847914e-06, "loss": 0.3152, "num_tokens": 2543007975.0, "step": 9715 }, { "epoch": 4.531468531468532, "grad_norm": 0.3395245128133879, "learning_rate": 6.0718349504186596e-06, "loss": 0.3177, "num_tokens": 2544305294.0, "step": 9720 }, { "epoch": 4.533799533799534, "grad_norm": 0.3234654794073021, "learning_rate": 6.061280892562856e-06, "loss": 0.313, "num_tokens": 2545614275.0, "step": 9725 }, { "epoch": 4.536130536130536, "grad_norm": 0.3230472732102473, "learning_rate": 6.050777798369387e-06, "loss": 0.3145, "num_tokens": 2546924995.0, "step": 9730 }, { "epoch": 4.538461538461538, "grad_norm": 0.32753670861955114, "learning_rate": 6.040325692805984e-06, "loss": 0.3119, "num_tokens": 2548235715.0, "step": 9735 }, { "epoch": 4.540792540792541, "grad_norm": 0.3312438687676759, "learning_rate": 6.029924600719165e-06, "loss": 0.3168, "num_tokens": 2549546435.0, "step": 9740 }, { "epoch": 4.543123543123543, "grad_norm": 0.34207396319559835, "learning_rate": 6.019574546834186e-06, "loss": 0.329, "num_tokens": 2550857155.0, "step": 9745 }, { "epoch": 4.545454545454545, "grad_norm": 0.32384193950890633, "learning_rate": 6.009275555754967e-06, "loss": 0.3133, "num_tokens": 2552167875.0, "step": 9750 }, { "epoch": 4.547785547785548, "grad_norm": 0.33222986120067743, "learning_rate": 5.999027651964054e-06, "loss": 0.3178, "num_tokens": 2553478595.0, "step": 9755 }, { "epoch": 4.550116550116551, "grad_norm": 0.3394591599285521, "learning_rate": 5.988830859822541e-06, "loss": 0.3106, "num_tokens": 2554789315.0, "step": 9760 }, { "epoch": 4.5524475524475525, "grad_norm": 0.3273894065028813, "learning_rate": 5.978685203570021e-06, "loss": 0.3109, "num_tokens": 2556100035.0, "step": 9765 }, { "epoch": 4.554778554778554, "grad_norm": 0.3441663270198807, "learning_rate": 5.968590707324535e-06, "loss": 0.3214, "num_tokens": 2557410755.0, "step": 9770 }, { "epoch": 4.557109557109557, "grad_norm": 0.3305110868532999, "learning_rate": 5.958547395082498e-06, "loss": 0.3214, "num_tokens": 2558721475.0, "step": 9775 }, { "epoch": 4.559440559440559, "grad_norm": 0.3318319061404925, "learning_rate": 5.948555290718658e-06, "loss": 0.3203, "num_tokens": 2560032195.0, "step": 9780 }, { "epoch": 4.561771561771562, "grad_norm": 0.3143112654235783, "learning_rate": 5.938614417986035e-06, "loss": 0.3238, "num_tokens": 2561342915.0, "step": 9785 }, { "epoch": 4.564102564102564, "grad_norm": 0.33345941184977745, "learning_rate": 5.928724800515848e-06, "loss": 0.3143, "num_tokens": 2562653635.0, "step": 9790 }, { "epoch": 4.566433566433567, "grad_norm": 0.34686197219373827, "learning_rate": 5.91888646181749e-06, "loss": 0.3137, "num_tokens": 2563948407.0, "step": 9795 }, { "epoch": 4.568764568764569, "grad_norm": 0.33117506317785395, "learning_rate": 5.909099425278451e-06, "loss": 0.32, "num_tokens": 2565259127.0, "step": 9800 }, { "epoch": 4.571095571095571, "grad_norm": 0.3435052314461775, "learning_rate": 5.899363714164259e-06, "loss": 0.3148, "num_tokens": 2566569847.0, "step": 9805 }, { "epoch": 4.573426573426573, "grad_norm": 0.3511413949641888, "learning_rate": 5.889679351618435e-06, "loss": 0.3239, "num_tokens": 2567880567.0, "step": 9810 }, { "epoch": 4.575757575757576, "grad_norm": 0.3277981153602279, "learning_rate": 5.880046360662442e-06, "loss": 0.319, "num_tokens": 2569191287.0, "step": 9815 }, { "epoch": 4.578088578088578, "grad_norm": 0.34041898778575075, "learning_rate": 5.870464764195621e-06, "loss": 0.3117, "num_tokens": 2570502007.0, "step": 9820 }, { "epoch": 4.58041958041958, "grad_norm": 0.3243193311955865, "learning_rate": 5.8609345849951275e-06, "loss": 0.312, "num_tokens": 2571812727.0, "step": 9825 }, { "epoch": 4.582750582750583, "grad_norm": 0.3310866370673146, "learning_rate": 5.851455845715912e-06, "loss": 0.3109, "num_tokens": 2573123447.0, "step": 9830 }, { "epoch": 4.585081585081585, "grad_norm": 0.31513987342632316, "learning_rate": 5.842028568890624e-06, "loss": 0.3069, "num_tokens": 2574434167.0, "step": 9835 }, { "epoch": 4.5874125874125875, "grad_norm": 0.3275061402794141, "learning_rate": 5.832652776929576e-06, "loss": 0.3048, "num_tokens": 2575744887.0, "step": 9840 }, { "epoch": 4.589743589743589, "grad_norm": 0.33530078415240044, "learning_rate": 5.823328492120709e-06, "loss": 0.3205, "num_tokens": 2577055607.0, "step": 9845 }, { "epoch": 4.592074592074592, "grad_norm": 0.3482685265210104, "learning_rate": 5.814055736629512e-06, "loss": 0.3222, "num_tokens": 2578366327.0, "step": 9850 }, { "epoch": 4.594405594405594, "grad_norm": 0.3537893890025293, "learning_rate": 5.804834532498973e-06, "loss": 0.3125, "num_tokens": 2579662506.0, "step": 9855 }, { "epoch": 4.596736596736597, "grad_norm": 0.3358652150747448, "learning_rate": 5.795664901649546e-06, "loss": 0.3123, "num_tokens": 2580973226.0, "step": 9860 }, { "epoch": 4.599067599067599, "grad_norm": 0.35598478994424876, "learning_rate": 5.78654686587908e-06, "loss": 0.3333, "num_tokens": 2582283946.0, "step": 9865 }, { "epoch": 4.601398601398602, "grad_norm": 0.331525987147412, "learning_rate": 5.777480446862771e-06, "loss": 0.3199, "num_tokens": 2583594666.0, "step": 9870 }, { "epoch": 4.603729603729604, "grad_norm": 0.32178290426004424, "learning_rate": 5.768465666153116e-06, "loss": 0.3289, "num_tokens": 2584905386.0, "step": 9875 }, { "epoch": 4.606060606060606, "grad_norm": 0.30393892105397075, "learning_rate": 5.759502545179865e-06, "loss": 0.3076, "num_tokens": 2586200553.0, "step": 9880 }, { "epoch": 4.608391608391608, "grad_norm": 0.32968332333326905, "learning_rate": 5.750591105249945e-06, "loss": 0.3105, "num_tokens": 2587511273.0, "step": 9885 }, { "epoch": 4.610722610722611, "grad_norm": 0.3231640929433455, "learning_rate": 5.741731367547445e-06, "loss": 0.3175, "num_tokens": 2588821993.0, "step": 9890 }, { "epoch": 4.613053613053613, "grad_norm": 0.31411918328374727, "learning_rate": 5.732923353133545e-06, "loss": 0.3102, "num_tokens": 2590132713.0, "step": 9895 }, { "epoch": 4.615384615384615, "grad_norm": 0.335002070080694, "learning_rate": 5.724167082946466e-06, "loss": 0.3225, "num_tokens": 2591443433.0, "step": 9900 }, { "epoch": 4.617715617715618, "grad_norm": 0.3229052741900776, "learning_rate": 5.715462577801427e-06, "loss": 0.3156, "num_tokens": 2592754153.0, "step": 9905 }, { "epoch": 4.62004662004662, "grad_norm": 0.3297660710371224, "learning_rate": 5.706809858390583e-06, "loss": 0.3276, "num_tokens": 2594060378.0, "step": 9910 }, { "epoch": 4.6223776223776225, "grad_norm": 0.3155783908720556, "learning_rate": 5.698208945283e-06, "loss": 0.2992, "num_tokens": 2595371098.0, "step": 9915 }, { "epoch": 4.624708624708624, "grad_norm": 0.3193903370909751, "learning_rate": 5.689659858924586e-06, "loss": 0.3068, "num_tokens": 2596681818.0, "step": 9920 }, { "epoch": 4.627039627039627, "grad_norm": 0.33774842672012795, "learning_rate": 5.6811626196380385e-06, "loss": 0.3121, "num_tokens": 2597992538.0, "step": 9925 }, { "epoch": 4.629370629370629, "grad_norm": 0.33740225064211665, "learning_rate": 5.672717247622816e-06, "loss": 0.3102, "num_tokens": 2599303258.0, "step": 9930 }, { "epoch": 4.631701631701632, "grad_norm": 0.3373159671469985, "learning_rate": 5.664323762955072e-06, "loss": 0.3221, "num_tokens": 2600613978.0, "step": 9935 }, { "epoch": 4.634032634032634, "grad_norm": 0.35331673034205946, "learning_rate": 5.655982185587621e-06, "loss": 0.3184, "num_tokens": 2601924698.0, "step": 9940 }, { "epoch": 4.636363636363637, "grad_norm": 0.34075562342066007, "learning_rate": 5.647692535349884e-06, "loss": 0.3176, "num_tokens": 2603235418.0, "step": 9945 }, { "epoch": 4.638694638694639, "grad_norm": 0.3272741009052787, "learning_rate": 5.6394548319478325e-06, "loss": 0.308, "num_tokens": 2604546138.0, "step": 9950 }, { "epoch": 4.641025641025641, "grad_norm": 0.3353878917152132, "learning_rate": 5.631269094963962e-06, "loss": 0.3132, "num_tokens": 2605856858.0, "step": 9955 }, { "epoch": 4.643356643356643, "grad_norm": 0.335095370367202, "learning_rate": 5.623135343857232e-06, "loss": 0.3179, "num_tokens": 2607167578.0, "step": 9960 }, { "epoch": 4.645687645687646, "grad_norm": 0.3448209805418296, "learning_rate": 5.615053597963018e-06, "loss": 0.3266, "num_tokens": 2608468942.0, "step": 9965 }, { "epoch": 4.648018648018648, "grad_norm": 0.3365763346832491, "learning_rate": 5.607023876493075e-06, "loss": 0.3251, "num_tokens": 2609779662.0, "step": 9970 }, { "epoch": 4.65034965034965, "grad_norm": 0.3371348358061654, "learning_rate": 5.59904619853548e-06, "loss": 0.314, "num_tokens": 2611090382.0, "step": 9975 }, { "epoch": 4.652680652680653, "grad_norm": 0.3211402055694574, "learning_rate": 5.591120583054602e-06, "loss": 0.3172, "num_tokens": 2612401102.0, "step": 9980 }, { "epoch": 4.655011655011655, "grad_norm": 0.32650824383942834, "learning_rate": 5.583247048891042e-06, "loss": 0.3177, "num_tokens": 2613704775.0, "step": 9985 }, { "epoch": 4.6573426573426575, "grad_norm": 0.33015275188111703, "learning_rate": 5.575425614761597e-06, "loss": 0.3105, "num_tokens": 2615005017.0, "step": 9990 }, { "epoch": 4.659673659673659, "grad_norm": 0.3350066530847012, "learning_rate": 5.567656299259212e-06, "loss": 0.3179, "num_tokens": 2616315737.0, "step": 9995 }, { "epoch": 4.662004662004662, "grad_norm": 0.3296673905861499, "learning_rate": 5.559939120852936e-06, "loss": 0.3183, "num_tokens": 2617626457.0, "step": 10000 }, { "epoch": 4.664335664335664, "grad_norm": 0.3314605447902211, "learning_rate": 5.552274097887879e-06, "loss": 0.311, "num_tokens": 2618937177.0, "step": 10005 }, { "epoch": 4.666666666666667, "grad_norm": 0.31963246919644556, "learning_rate": 5.544661248585172e-06, "loss": 0.3148, "num_tokens": 2620238809.0, "step": 10010 }, { "epoch": 4.668997668997669, "grad_norm": 0.32606186826503414, "learning_rate": 5.537100591041915e-06, "loss": 0.3197, "num_tokens": 2621537881.0, "step": 10015 }, { "epoch": 4.671328671328672, "grad_norm": 0.3127362641909877, "learning_rate": 5.529592143231142e-06, "loss": 0.3187, "num_tokens": 2622848601.0, "step": 10020 }, { "epoch": 4.673659673659674, "grad_norm": 0.34140879351797326, "learning_rate": 5.522135923001767e-06, "loss": 0.3129, "num_tokens": 2624159321.0, "step": 10025 }, { "epoch": 4.6759906759906755, "grad_norm": 0.32642677770519635, "learning_rate": 5.514731948078565e-06, "loss": 0.3089, "num_tokens": 2625454738.0, "step": 10030 }, { "epoch": 4.678321678321678, "grad_norm": 0.329034389428116, "learning_rate": 5.5073802360621035e-06, "loss": 0.315, "num_tokens": 2626765458.0, "step": 10035 }, { "epoch": 4.680652680652681, "grad_norm": 0.33163073393602516, "learning_rate": 5.50008080442871e-06, "loss": 0.3146, "num_tokens": 2628076178.0, "step": 10040 }, { "epoch": 4.682983682983683, "grad_norm": 0.3272067629094256, "learning_rate": 5.492833670530445e-06, "loss": 0.3173, "num_tokens": 2629386898.0, "step": 10045 }, { "epoch": 4.685314685314685, "grad_norm": 0.3144574455080941, "learning_rate": 5.485638851595033e-06, "loss": 0.3054, "num_tokens": 2630697618.0, "step": 10050 }, { "epoch": 4.687645687645688, "grad_norm": 0.33627454004902, "learning_rate": 5.478496364725844e-06, "loss": 0.3188, "num_tokens": 2632008338.0, "step": 10055 }, { "epoch": 4.68997668997669, "grad_norm": 0.3353988218499314, "learning_rate": 5.471406226901843e-06, "loss": 0.3178, "num_tokens": 2633315147.0, "step": 10060 }, { "epoch": 4.6923076923076925, "grad_norm": 0.34081231503644693, "learning_rate": 5.464368454977559e-06, "loss": 0.3181, "num_tokens": 2634625867.0, "step": 10065 }, { "epoch": 4.694638694638694, "grad_norm": 0.32001347894247395, "learning_rate": 5.457383065683023e-06, "loss": 0.3094, "num_tokens": 2635936587.0, "step": 10070 }, { "epoch": 4.696969696969697, "grad_norm": 0.33209202640372304, "learning_rate": 5.450450075623761e-06, "loss": 0.3203, "num_tokens": 2637233194.0, "step": 10075 }, { "epoch": 4.699300699300699, "grad_norm": 0.3444659685507314, "learning_rate": 5.443569501280724e-06, "loss": 0.3298, "num_tokens": 2638543914.0, "step": 10080 }, { "epoch": 4.701631701631702, "grad_norm": 0.32572981399860407, "learning_rate": 5.436741359010265e-06, "loss": 0.3145, "num_tokens": 2639854634.0, "step": 10085 }, { "epoch": 4.703962703962704, "grad_norm": 0.3265125453077855, "learning_rate": 5.429965665044099e-06, "loss": 0.3113, "num_tokens": 2641165354.0, "step": 10090 }, { "epoch": 4.706293706293707, "grad_norm": 0.32908645785435287, "learning_rate": 5.4232424354892605e-06, "loss": 0.3259, "num_tokens": 2642476074.0, "step": 10095 }, { "epoch": 4.708624708624709, "grad_norm": 0.32471263137765566, "learning_rate": 5.4165716863280626e-06, "loss": 0.3148, "num_tokens": 2643786794.0, "step": 10100 }, { "epoch": 4.7109557109557105, "grad_norm": 0.33045093010828397, "learning_rate": 5.409953433418071e-06, "loss": 0.3265, "num_tokens": 2645097514.0, "step": 10105 }, { "epoch": 4.713286713286713, "grad_norm": 0.3293031597544229, "learning_rate": 5.403387692492053e-06, "loss": 0.312, "num_tokens": 2646390978.0, "step": 10110 }, { "epoch": 4.715617715617715, "grad_norm": 0.33516744422096806, "learning_rate": 5.396874479157943e-06, "loss": 0.3169, "num_tokens": 2647689285.0, "step": 10115 }, { "epoch": 4.717948717948718, "grad_norm": 0.3351308591346485, "learning_rate": 5.39041380889882e-06, "loss": 0.3235, "num_tokens": 2648996984.0, "step": 10120 }, { "epoch": 4.72027972027972, "grad_norm": 0.3345027011060605, "learning_rate": 5.384005697072842e-06, "loss": 0.308, "num_tokens": 2650307704.0, "step": 10125 }, { "epoch": 4.722610722610723, "grad_norm": 0.326143155782565, "learning_rate": 5.377650158913239e-06, "loss": 0.3272, "num_tokens": 2651618424.0, "step": 10130 }, { "epoch": 4.724941724941725, "grad_norm": 0.3382309775663869, "learning_rate": 5.371347209528259e-06, "loss": 0.3201, "num_tokens": 2652929144.0, "step": 10135 }, { "epoch": 4.7272727272727275, "grad_norm": 0.3229354536782934, "learning_rate": 5.365096863901139e-06, "loss": 0.317, "num_tokens": 2654239864.0, "step": 10140 }, { "epoch": 4.729603729603729, "grad_norm": 0.3265549491709372, "learning_rate": 5.3588991368900655e-06, "loss": 0.3197, "num_tokens": 2655550584.0, "step": 10145 }, { "epoch": 4.731934731934732, "grad_norm": 0.3317883087071852, "learning_rate": 5.352754043228138e-06, "loss": 0.3105, "num_tokens": 2656861304.0, "step": 10150 }, { "epoch": 4.734265734265734, "grad_norm": 0.30886908963659704, "learning_rate": 5.346661597523347e-06, "loss": 0.3183, "num_tokens": 2658172024.0, "step": 10155 }, { "epoch": 4.736596736596737, "grad_norm": 0.3265593067229369, "learning_rate": 5.340621814258523e-06, "loss": 0.3113, "num_tokens": 2659482744.0, "step": 10160 }, { "epoch": 4.738927738927739, "grad_norm": 0.34583140436007476, "learning_rate": 5.334634707791303e-06, "loss": 0.3193, "num_tokens": 2660789135.0, "step": 10165 }, { "epoch": 4.741258741258742, "grad_norm": 0.33571097006056455, "learning_rate": 5.328700292354117e-06, "loss": 0.3122, "num_tokens": 2662099855.0, "step": 10170 }, { "epoch": 4.743589743589744, "grad_norm": 0.3233278556899711, "learning_rate": 5.322818582054123e-06, "loss": 0.3159, "num_tokens": 2663410575.0, "step": 10175 }, { "epoch": 4.7459207459207455, "grad_norm": 0.3317303155316829, "learning_rate": 5.316989590873196e-06, "loss": 0.3194, "num_tokens": 2664721295.0, "step": 10180 }, { "epoch": 4.748251748251748, "grad_norm": 0.33625182489617234, "learning_rate": 5.311213332667893e-06, "loss": 0.3163, "num_tokens": 2666032015.0, "step": 10185 }, { "epoch": 4.75058275058275, "grad_norm": 0.3356074033954618, "learning_rate": 5.305489821169408e-06, "loss": 0.3078, "num_tokens": 2667342735.0, "step": 10190 }, { "epoch": 4.752913752913753, "grad_norm": 0.34585034308778323, "learning_rate": 5.2998190699835485e-06, "loss": 0.3257, "num_tokens": 2668653455.0, "step": 10195 }, { "epoch": 4.755244755244755, "grad_norm": 0.3657646741180601, "learning_rate": 5.2942010925907074e-06, "loss": 0.3309, "num_tokens": 2669964175.0, "step": 10200 }, { "epoch": 4.757575757575758, "grad_norm": 0.3300884821473932, "learning_rate": 5.288635902345814e-06, "loss": 0.3172, "num_tokens": 2671274895.0, "step": 10205 }, { "epoch": 4.75990675990676, "grad_norm": 0.3148848991416642, "learning_rate": 5.283123512478321e-06, "loss": 0.3097, "num_tokens": 2672585615.0, "step": 10210 }, { "epoch": 4.7622377622377625, "grad_norm": 0.3248022910063405, "learning_rate": 5.2776639360921664e-06, "loss": 0.3113, "num_tokens": 2673881785.0, "step": 10215 }, { "epoch": 4.764568764568764, "grad_norm": 0.32808387177263987, "learning_rate": 5.272257186165733e-06, "loss": 0.3208, "num_tokens": 2675192505.0, "step": 10220 }, { "epoch": 4.766899766899767, "grad_norm": 0.3269866573020375, "learning_rate": 5.26690327555183e-06, "loss": 0.3149, "num_tokens": 2676503225.0, "step": 10225 }, { "epoch": 4.769230769230769, "grad_norm": 0.332961069358366, "learning_rate": 5.261602216977668e-06, "loss": 0.3145, "num_tokens": 2677813945.0, "step": 10230 }, { "epoch": 4.771561771561771, "grad_norm": 0.33357108039723987, "learning_rate": 5.256354023044799e-06, "loss": 0.324, "num_tokens": 2679124665.0, "step": 10235 }, { "epoch": 4.773892773892774, "grad_norm": 0.3288868211446766, "learning_rate": 5.251158706229117e-06, "loss": 0.318, "num_tokens": 2680435385.0, "step": 10240 }, { "epoch": 4.776223776223777, "grad_norm": 0.3295445331080107, "learning_rate": 5.246016278880824e-06, "loss": 0.3233, "num_tokens": 2681746105.0, "step": 10245 }, { "epoch": 4.778554778554779, "grad_norm": 0.3456197462845134, "learning_rate": 5.240926753224386e-06, "loss": 0.3186, "num_tokens": 2683056825.0, "step": 10250 }, { "epoch": 4.7808857808857805, "grad_norm": 0.3204798461467708, "learning_rate": 5.235890141358512e-06, "loss": 0.3118, "num_tokens": 2684367545.0, "step": 10255 }, { "epoch": 4.783216783216783, "grad_norm": 0.330521485739032, "learning_rate": 5.230906455256126e-06, "loss": 0.319, "num_tokens": 2685678265.0, "step": 10260 }, { "epoch": 4.785547785547785, "grad_norm": 0.32924346560709944, "learning_rate": 5.225975706764347e-06, "loss": 0.3112, "num_tokens": 2686988499.0, "step": 10265 }, { "epoch": 4.787878787878788, "grad_norm": 0.3368638495877885, "learning_rate": 5.221097907604436e-06, "loss": 0.3194, "num_tokens": 2688299219.0, "step": 10270 }, { "epoch": 4.79020979020979, "grad_norm": 0.3263693614554864, "learning_rate": 5.216273069371794e-06, "loss": 0.3189, "num_tokens": 2689609939.0, "step": 10275 }, { "epoch": 4.792540792540793, "grad_norm": 0.334167060120909, "learning_rate": 5.211501203535926e-06, "loss": 0.316, "num_tokens": 2690918769.0, "step": 10280 }, { "epoch": 4.794871794871795, "grad_norm": 0.33700365497537554, "learning_rate": 5.2067823214404076e-06, "loss": 0.3136, "num_tokens": 2692229489.0, "step": 10285 }, { "epoch": 4.7972027972027975, "grad_norm": 0.3550316959254366, "learning_rate": 5.2021164343028615e-06, "loss": 0.3226, "num_tokens": 2693540209.0, "step": 10290 }, { "epoch": 4.799533799533799, "grad_norm": 0.32560252408951285, "learning_rate": 5.1975035532149374e-06, "loss": 0.3153, "num_tokens": 2694850929.0, "step": 10295 }, { "epoch": 4.801864801864802, "grad_norm": 0.33773170010312276, "learning_rate": 5.192943689142276e-06, "loss": 0.3197, "num_tokens": 2696161649.0, "step": 10300 }, { "epoch": 4.804195804195804, "grad_norm": 0.32245870964120255, "learning_rate": 5.188436852924488e-06, "loss": 0.3096, "num_tokens": 2697472369.0, "step": 10305 }, { "epoch": 4.806526806526806, "grad_norm": 0.3157741766261794, "learning_rate": 5.183983055275129e-06, "loss": 0.318, "num_tokens": 2698783089.0, "step": 10310 }, { "epoch": 4.808857808857809, "grad_norm": 0.3230046443910475, "learning_rate": 5.17958230678167e-06, "loss": 0.3038, "num_tokens": 2700093809.0, "step": 10315 }, { "epoch": 4.811188811188811, "grad_norm": 0.32531156650812953, "learning_rate": 5.175234617905471e-06, "loss": 0.3056, "num_tokens": 2701404529.0, "step": 10320 }, { "epoch": 4.813519813519814, "grad_norm": 0.3377022386596855, "learning_rate": 5.170939998981775e-06, "loss": 0.3138, "num_tokens": 2702706568.0, "step": 10325 }, { "epoch": 4.8158508158508155, "grad_norm": 0.3440675929974891, "learning_rate": 5.16669846021965e-06, "loss": 0.3208, "num_tokens": 2704003885.0, "step": 10330 }, { "epoch": 4.818181818181818, "grad_norm": 0.3293134125006892, "learning_rate": 5.162510011701991e-06, "loss": 0.313, "num_tokens": 2705310038.0, "step": 10335 }, { "epoch": 4.82051282051282, "grad_norm": 0.34589478246737115, "learning_rate": 5.15837466338549e-06, "loss": 0.3227, "num_tokens": 2706605004.0, "step": 10340 }, { "epoch": 4.822843822843823, "grad_norm": 0.3250195530849573, "learning_rate": 5.15429242510061e-06, "loss": 0.3095, "num_tokens": 2707915724.0, "step": 10345 }, { "epoch": 4.825174825174825, "grad_norm": 0.33813235349784826, "learning_rate": 5.150263306551556e-06, "loss": 0.3176, "num_tokens": 2709226444.0, "step": 10350 }, { "epoch": 4.827505827505828, "grad_norm": 0.3342791833320397, "learning_rate": 5.146287317316262e-06, "loss": 0.3177, "num_tokens": 2710537164.0, "step": 10355 }, { "epoch": 4.82983682983683, "grad_norm": 0.32303166071349837, "learning_rate": 5.1423644668463695e-06, "loss": 0.3127, "num_tokens": 2711847884.0, "step": 10360 }, { "epoch": 4.8321678321678325, "grad_norm": 0.34342920997262655, "learning_rate": 5.138494764467189e-06, "loss": 0.3207, "num_tokens": 2713158604.0, "step": 10365 }, { "epoch": 4.834498834498834, "grad_norm": 0.3370264711785083, "learning_rate": 5.134678219377695e-06, "loss": 0.3169, "num_tokens": 2714469324.0, "step": 10370 }, { "epoch": 4.836829836829837, "grad_norm": 0.33019492580209653, "learning_rate": 5.1309148406505e-06, "loss": 0.319, "num_tokens": 2715780044.0, "step": 10375 }, { "epoch": 4.839160839160839, "grad_norm": 0.3296889225888387, "learning_rate": 5.127204637231821e-06, "loss": 0.3096, "num_tokens": 2717090764.0, "step": 10380 }, { "epoch": 4.841491841491841, "grad_norm": 0.3170653884338263, "learning_rate": 5.12354761794148e-06, "loss": 0.3151, "num_tokens": 2718401484.0, "step": 10385 }, { "epoch": 4.843822843822844, "grad_norm": 0.3171594099332368, "learning_rate": 5.1199437914728596e-06, "loss": 0.3121, "num_tokens": 2719712204.0, "step": 10390 }, { "epoch": 4.846153846153846, "grad_norm": 0.333775671862216, "learning_rate": 5.116393166392901e-06, "loss": 0.3082, "num_tokens": 2721022924.0, "step": 10395 }, { "epoch": 4.848484848484849, "grad_norm": 0.34403397168216404, "learning_rate": 5.112895751142073e-06, "loss": 0.3231, "num_tokens": 2722333644.0, "step": 10400 }, { "epoch": 4.8508158508158505, "grad_norm": 0.31993219706477866, "learning_rate": 5.109451554034357e-06, "loss": 0.3184, "num_tokens": 2723644364.0, "step": 10405 }, { "epoch": 4.853146853146853, "grad_norm": 0.343409768713205, "learning_rate": 5.1060605832572235e-06, "loss": 0.3171, "num_tokens": 2724955084.0, "step": 10410 }, { "epoch": 4.855477855477855, "grad_norm": 0.32246016600388927, "learning_rate": 5.102722846871616e-06, "loss": 0.3084, "num_tokens": 2726265804.0, "step": 10415 }, { "epoch": 4.857808857808858, "grad_norm": 0.3418584388038729, "learning_rate": 5.099438352811931e-06, "loss": 0.3302, "num_tokens": 2727576524.0, "step": 10420 }, { "epoch": 4.86013986013986, "grad_norm": 0.3242160539074971, "learning_rate": 5.0962071088859935e-06, "loss": 0.3102, "num_tokens": 2728887244.0, "step": 10425 }, { "epoch": 4.862470862470863, "grad_norm": 0.3149813322054902, "learning_rate": 5.093029122775049e-06, "loss": 0.3071, "num_tokens": 2730197964.0, "step": 10430 }, { "epoch": 4.864801864801865, "grad_norm": 0.3334336110289972, "learning_rate": 5.08990440203374e-06, "loss": 0.3203, "num_tokens": 2731508684.0, "step": 10435 }, { "epoch": 4.867132867132867, "grad_norm": 0.32299813348426, "learning_rate": 5.086832954090082e-06, "loss": 0.313, "num_tokens": 2732819404.0, "step": 10440 }, { "epoch": 4.869463869463869, "grad_norm": 0.3364027385588916, "learning_rate": 5.083814786245458e-06, "loss": 0.3179, "num_tokens": 2734130124.0, "step": 10445 }, { "epoch": 4.871794871794872, "grad_norm": 0.3656168760454608, "learning_rate": 5.080849905674588e-06, "loss": 0.3201, "num_tokens": 2735440844.0, "step": 10450 }, { "epoch": 4.874125874125874, "grad_norm": 0.33683461461528713, "learning_rate": 5.077938319425526e-06, "loss": 0.3186, "num_tokens": 2736751564.0, "step": 10455 }, { "epoch": 4.876456876456876, "grad_norm": 0.3300837602076609, "learning_rate": 5.075080034419631e-06, "loss": 0.3262, "num_tokens": 2738062284.0, "step": 10460 }, { "epoch": 4.878787878787879, "grad_norm": 0.348047283341893, "learning_rate": 5.072275057451558e-06, "loss": 0.3164, "num_tokens": 2739356820.0, "step": 10465 }, { "epoch": 4.881118881118881, "grad_norm": 0.32910332567884476, "learning_rate": 5.0695233951892345e-06, "loss": 0.3107, "num_tokens": 2740662089.0, "step": 10470 }, { "epoch": 4.883449883449884, "grad_norm": 0.3200833744056691, "learning_rate": 5.066825054173854e-06, "loss": 0.3117, "num_tokens": 2741972809.0, "step": 10475 }, { "epoch": 4.8857808857808855, "grad_norm": 0.3341587779600143, "learning_rate": 5.064180040819858e-06, "loss": 0.3179, "num_tokens": 2743283529.0, "step": 10480 }, { "epoch": 4.888111888111888, "grad_norm": 0.33148410077095414, "learning_rate": 5.0615883614149136e-06, "loss": 0.3172, "num_tokens": 2744594249.0, "step": 10485 }, { "epoch": 4.89044289044289, "grad_norm": 0.3272484124935763, "learning_rate": 5.059050022119904e-06, "loss": 0.3165, "num_tokens": 2745887369.0, "step": 10490 }, { "epoch": 4.892773892773893, "grad_norm": 0.3209922201282976, "learning_rate": 5.056565028968916e-06, "loss": 0.3154, "num_tokens": 2747198089.0, "step": 10495 }, { "epoch": 4.895104895104895, "grad_norm": 0.3269414783389918, "learning_rate": 5.05413338786922e-06, "loss": 0.312, "num_tokens": 2748508809.0, "step": 10500 }, { "epoch": 4.897435897435898, "grad_norm": 0.31618238859555264, "learning_rate": 5.051755104601264e-06, "loss": 0.3143, "num_tokens": 2749819529.0, "step": 10505 }, { "epoch": 4.8997668997669, "grad_norm": 0.32153457363047355, "learning_rate": 5.049430184818651e-06, "loss": 0.3224, "num_tokens": 2751117328.0, "step": 10510 }, { "epoch": 4.902097902097902, "grad_norm": 0.31465978988545573, "learning_rate": 5.047158634048129e-06, "loss": 0.3177, "num_tokens": 2752420296.0, "step": 10515 }, { "epoch": 4.9044289044289044, "grad_norm": 0.31044680611687897, "learning_rate": 5.044940457689581e-06, "loss": 0.3105, "num_tokens": 2753731016.0, "step": 10520 }, { "epoch": 4.906759906759907, "grad_norm": 0.31811560483359963, "learning_rate": 5.042775661016008e-06, "loss": 0.3169, "num_tokens": 2755041736.0, "step": 10525 }, { "epoch": 4.909090909090909, "grad_norm": 0.31742684514452485, "learning_rate": 5.040664249173518e-06, "loss": 0.305, "num_tokens": 2756352456.0, "step": 10530 }, { "epoch": 4.911421911421911, "grad_norm": 0.33606166438372637, "learning_rate": 5.038606227181312e-06, "loss": 0.3182, "num_tokens": 2757663176.0, "step": 10535 }, { "epoch": 4.913752913752914, "grad_norm": 0.3205242654431987, "learning_rate": 5.0366015999316775e-06, "loss": 0.3147, "num_tokens": 2758973413.0, "step": 10540 }, { "epoch": 4.916083916083916, "grad_norm": 0.3427922994912874, "learning_rate": 5.034650372189974e-06, "loss": 0.3125, "num_tokens": 2760284133.0, "step": 10545 }, { "epoch": 4.918414918414919, "grad_norm": 0.3310997932717806, "learning_rate": 5.0327525485946135e-06, "loss": 0.3184, "num_tokens": 2761592831.0, "step": 10550 }, { "epoch": 4.9207459207459205, "grad_norm": 0.3393685987917944, "learning_rate": 5.030908133657063e-06, "loss": 0.3156, "num_tokens": 2762895660.0, "step": 10555 }, { "epoch": 4.923076923076923, "grad_norm": 0.3322488130315273, "learning_rate": 5.029117131761826e-06, "loss": 0.3213, "num_tokens": 2764206380.0, "step": 10560 }, { "epoch": 4.925407925407925, "grad_norm": 0.35642657501632585, "learning_rate": 5.027379547166436e-06, "loss": 0.3173, "num_tokens": 2765517100.0, "step": 10565 }, { "epoch": 4.927738927738928, "grad_norm": 0.33722387753318867, "learning_rate": 5.025695384001438e-06, "loss": 0.3297, "num_tokens": 2766827820.0, "step": 10570 }, { "epoch": 4.93006993006993, "grad_norm": 0.32350013184482923, "learning_rate": 5.02406464627039e-06, "loss": 0.3141, "num_tokens": 2768138540.0, "step": 10575 }, { "epoch": 4.932400932400933, "grad_norm": 0.31985329238680343, "learning_rate": 5.0224873378498475e-06, "loss": 0.3103, "num_tokens": 2769449260.0, "step": 10580 }, { "epoch": 4.934731934731935, "grad_norm": 0.33320437693468646, "learning_rate": 5.0209634624893535e-06, "loss": 0.316, "num_tokens": 2770759980.0, "step": 10585 }, { "epoch": 4.937062937062937, "grad_norm": 0.34199930181650334, "learning_rate": 5.0194930238114344e-06, "loss": 0.3165, "num_tokens": 2772070700.0, "step": 10590 }, { "epoch": 4.9393939393939394, "grad_norm": 0.33840903211030815, "learning_rate": 5.01807602531158e-06, "loss": 0.3279, "num_tokens": 2773381420.0, "step": 10595 }, { "epoch": 4.941724941724941, "grad_norm": 0.33970146942955454, "learning_rate": 5.016712470358254e-06, "loss": 0.3243, "num_tokens": 2774692140.0, "step": 10600 }, { "epoch": 4.944055944055944, "grad_norm": 0.32417078423146617, "learning_rate": 5.015402362192865e-06, "loss": 0.3095, "num_tokens": 2776002860.0, "step": 10605 }, { "epoch": 4.946386946386946, "grad_norm": 0.31477911467606806, "learning_rate": 5.0141457039297765e-06, "loss": 0.3152, "num_tokens": 2777313580.0, "step": 10610 }, { "epoch": 4.948717948717949, "grad_norm": 0.3382420127269151, "learning_rate": 5.012942498556292e-06, "loss": 0.3145, "num_tokens": 2778624300.0, "step": 10615 }, { "epoch": 4.951048951048951, "grad_norm": 0.31746762640712656, "learning_rate": 5.011792748932641e-06, "loss": 0.3067, "num_tokens": 2779935020.0, "step": 10620 }, { "epoch": 4.953379953379954, "grad_norm": 0.3281133514868718, "learning_rate": 5.010696457791986e-06, "loss": 0.3132, "num_tokens": 2781245740.0, "step": 10625 }, { "epoch": 4.9557109557109555, "grad_norm": 0.3239412806584807, "learning_rate": 5.009653627740407e-06, "loss": 0.3212, "num_tokens": 2782556460.0, "step": 10630 }, { "epoch": 4.958041958041958, "grad_norm": 0.3368010167927629, "learning_rate": 5.008664261256898e-06, "loss": 0.3145, "num_tokens": 2783867180.0, "step": 10635 }, { "epoch": 4.96037296037296, "grad_norm": 0.34491073030538105, "learning_rate": 5.007728360693355e-06, "loss": 0.3176, "num_tokens": 2785177900.0, "step": 10640 }, { "epoch": 4.962703962703963, "grad_norm": 0.3510601956916241, "learning_rate": 5.006845928274586e-06, "loss": 0.3187, "num_tokens": 2786488620.0, "step": 10645 }, { "epoch": 4.965034965034965, "grad_norm": 0.3337835159331755, "learning_rate": 5.006016966098288e-06, "loss": 0.314, "num_tokens": 2787799340.0, "step": 10650 }, { "epoch": 4.967365967365968, "grad_norm": 0.3172585463158374, "learning_rate": 5.005241476135051e-06, "loss": 0.317, "num_tokens": 2789110060.0, "step": 10655 }, { "epoch": 4.96969696969697, "grad_norm": 0.32635822297017036, "learning_rate": 5.004519460228356e-06, "loss": 0.3102, "num_tokens": 2790420780.0, "step": 10660 }, { "epoch": 4.972027972027972, "grad_norm": 0.34214637134938164, "learning_rate": 5.003850920094564e-06, "loss": 0.3127, "num_tokens": 2791731500.0, "step": 10665 }, { "epoch": 4.9743589743589745, "grad_norm": 0.31863256082976954, "learning_rate": 5.00323585732291e-06, "loss": 0.3037, "num_tokens": 2793042220.0, "step": 10670 }, { "epoch": 4.976689976689976, "grad_norm": 0.33066360911088155, "learning_rate": 5.00267427337551e-06, "loss": 0.3155, "num_tokens": 2794352940.0, "step": 10675 }, { "epoch": 4.979020979020979, "grad_norm": 0.32620437111284734, "learning_rate": 5.002166169587351e-06, "loss": 0.3171, "num_tokens": 2795657814.0, "step": 10680 }, { "epoch": 4.981351981351981, "grad_norm": 0.33430055011282034, "learning_rate": 5.001711547166285e-06, "loss": 0.3189, "num_tokens": 2796968534.0, "step": 10685 }, { "epoch": 4.983682983682984, "grad_norm": 0.3224437309450024, "learning_rate": 5.001310407193031e-06, "loss": 0.3223, "num_tokens": 2798279254.0, "step": 10690 }, { "epoch": 4.986013986013986, "grad_norm": 0.32410671370691807, "learning_rate": 5.000962750621168e-06, "loss": 0.3311, "num_tokens": 2799589974.0, "step": 10695 }, { "epoch": 4.988344988344989, "grad_norm": 0.3235106959855654, "learning_rate": 5.0006685782771445e-06, "loss": 0.3132, "num_tokens": 2800887285.0, "step": 10700 }, { "epoch": 4.9906759906759905, "grad_norm": 0.3305908727408153, "learning_rate": 5.000427890860252e-06, "loss": 0.3113, "num_tokens": 2802198005.0, "step": 10705 }, { "epoch": 4.993006993006993, "grad_norm": 0.33403825665973846, "learning_rate": 5.000240688942652e-06, "loss": 0.3186, "num_tokens": 2803508725.0, "step": 10710 }, { "epoch": 4.995337995337995, "grad_norm": 0.3278267015029189, "learning_rate": 5.000106972969358e-06, "loss": 0.3166, "num_tokens": 2804819445.0, "step": 10715 }, { "epoch": 4.997668997668997, "grad_norm": 0.3295223914621982, "learning_rate": 5.000026743258234e-06, "loss": 0.3119, "num_tokens": 2806130165.0, "step": 10720 }, { "epoch": 5.0, "grad_norm": 0.3242625515113468, "learning_rate": 5e-06, "loss": 0.3108, "num_tokens": 2807440885.0, "step": 10725 }, { "epoch": 5.0, "step": 10725, "total_flos": 2444245755494400.0, "train_loss": 0.42360519842668015, "train_runtime": 82605.1286, "train_samples_per_second": 2.077, "train_steps_per_second": 0.13 } ], "logging_steps": 5, "max_steps": 10725, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2444245755494400.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }