| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 25.0, | |
| "eval_steps": 500, | |
| "global_step": 132825, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.09410878976096368, | |
| "grad_norm": 7.552914619445801, | |
| "learning_rate": 4.843152017065061e-05, | |
| "loss": 1.9573, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.18821757952192736, | |
| "grad_norm": 10.377354621887207, | |
| "learning_rate": 4.686304034130121e-05, | |
| "loss": 1.6637, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.282326369282891, | |
| "grad_norm": 10.336068153381348, | |
| "learning_rate": 4.529456051195182e-05, | |
| "loss": 1.6903, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.3764351590438547, | |
| "grad_norm": 4.373341083526611, | |
| "learning_rate": 4.372608068260242e-05, | |
| "loss": 1.7066, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.47054394880481837, | |
| "grad_norm": 6.67123556137085, | |
| "learning_rate": 4.2157600853253026e-05, | |
| "loss": 1.6262, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.564652738565782, | |
| "grad_norm": 7.045924663543701, | |
| "learning_rate": 4.0589121023903634e-05, | |
| "loss": 1.6133, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.6587615283267457, | |
| "grad_norm": 8.646467208862305, | |
| "learning_rate": 3.902064119455424e-05, | |
| "loss": 1.5958, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.7528703180877094, | |
| "grad_norm": 17.767257690429688, | |
| "learning_rate": 3.745216136520485e-05, | |
| "loss": 1.5926, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.846979107848673, | |
| "grad_norm": 6.851809978485107, | |
| "learning_rate": 3.588368153585545e-05, | |
| "loss": 1.5898, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.9410878976096367, | |
| "grad_norm": 6.367198467254639, | |
| "learning_rate": 3.431520170650606e-05, | |
| "loss": 1.5842, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.0351966873706004, | |
| "grad_norm": 3.8963205814361572, | |
| "learning_rate": 3.274672187715666e-05, | |
| "loss": 1.6023, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.129305477131564, | |
| "grad_norm": 3.492241382598877, | |
| "learning_rate": 3.1178242047807265e-05, | |
| "loss": 1.5835, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.2234142668925279, | |
| "grad_norm": 6.9010396003723145, | |
| "learning_rate": 2.960976221845787e-05, | |
| "loss": 1.5804, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.3175230566534915, | |
| "grad_norm": 7.097581386566162, | |
| "learning_rate": 2.8041282389108477e-05, | |
| "loss": 1.5685, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.411631846414455, | |
| "grad_norm": 4.106161117553711, | |
| "learning_rate": 2.647280255975908e-05, | |
| "loss": 1.5557, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.5057406361754189, | |
| "grad_norm": 5.339535236358643, | |
| "learning_rate": 2.490432273040969e-05, | |
| "loss": 1.5266, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.5998494259363825, | |
| "grad_norm": 5.467094421386719, | |
| "learning_rate": 2.3335842901060293e-05, | |
| "loss": 1.5394, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.693958215697346, | |
| "grad_norm": 5.166747093200684, | |
| "learning_rate": 2.17673630717109e-05, | |
| "loss": 1.5131, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.7880670054583097, | |
| "grad_norm": 3.2604665756225586, | |
| "learning_rate": 2.0198883242361504e-05, | |
| "loss": 1.4695, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.8821757952192735, | |
| "grad_norm": 6.9691362380981445, | |
| "learning_rate": 1.8630403413012108e-05, | |
| "loss": 1.4532, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.9762845849802373, | |
| "grad_norm": 3.044600248336792, | |
| "learning_rate": 1.7061923583662716e-05, | |
| "loss": 1.4349, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.070393374741201, | |
| "grad_norm": 5.485976219177246, | |
| "learning_rate": 1.549344375431332e-05, | |
| "loss": 1.4098, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.1645021645021645, | |
| "grad_norm": 3.2020225524902344, | |
| "learning_rate": 1.3924963924963927e-05, | |
| "loss": 1.4025, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.258610954263128, | |
| "grad_norm": 4.406314849853516, | |
| "learning_rate": 1.235648409561453e-05, | |
| "loss": 1.384, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.3527197440240917, | |
| "grad_norm": 7.23325252532959, | |
| "learning_rate": 1.0788004266265137e-05, | |
| "loss": 1.3831, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.4468285337850557, | |
| "grad_norm": 6.645949840545654, | |
| "learning_rate": 9.219524436915741e-06, | |
| "loss": 1.3611, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.5409373235460193, | |
| "grad_norm": 4.476417064666748, | |
| "learning_rate": 7.651044607566347e-06, | |
| "loss": 1.3478, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.635046113306983, | |
| "grad_norm": 4.497859954833984, | |
| "learning_rate": 6.082564778216952e-06, | |
| "loss": 1.3354, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.7291549030679465, | |
| "grad_norm": 4.2654008865356445, | |
| "learning_rate": 4.514084948867558e-06, | |
| "loss": 1.3229, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.82326369282891, | |
| "grad_norm": 3.607623338699341, | |
| "learning_rate": 2.945605119518163e-06, | |
| "loss": 1.3001, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.917372482589874, | |
| "grad_norm": 3.7838690280914307, | |
| "learning_rate": 1.3771252901687685e-06, | |
| "loss": 1.2967, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 3.0114812723508377, | |
| "grad_norm": 7.810318946838379, | |
| "learning_rate": 3.494259363824581e-05, | |
| "loss": 1.4147, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 3.1055900621118013, | |
| "grad_norm": 3.653393030166626, | |
| "learning_rate": 3.4472049689440996e-05, | |
| "loss": 1.4375, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 3.199698851872765, | |
| "grad_norm": 6.7401018142700195, | |
| "learning_rate": 3.400150574063617e-05, | |
| "loss": 1.3906, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 3.2938076416337285, | |
| "grad_norm": 4.238823890686035, | |
| "learning_rate": 3.3530961791831364e-05, | |
| "loss": 1.3716, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 3.387916431394692, | |
| "grad_norm": 2.793461561203003, | |
| "learning_rate": 3.306041784302654e-05, | |
| "loss": 1.3764, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 3.4820252211556557, | |
| "grad_norm": 3.1460959911346436, | |
| "learning_rate": 3.2589873894221726e-05, | |
| "loss": 1.3557, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 3.5761340109166198, | |
| "grad_norm": 6.384432792663574, | |
| "learning_rate": 3.2119329945416903e-05, | |
| "loss": 1.3503, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 3.6702428006775834, | |
| "grad_norm": 3.8678534030914307, | |
| "learning_rate": 3.164878599661209e-05, | |
| "loss": 1.3407, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 3.764351590438547, | |
| "grad_norm": 2.776543140411377, | |
| "learning_rate": 3.1178242047807265e-05, | |
| "loss": 1.3593, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 3.8584603801995105, | |
| "grad_norm": 8.332947731018066, | |
| "learning_rate": 3.070769809900245e-05, | |
| "loss": 1.3323, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 3.9525691699604746, | |
| "grad_norm": 5.115898609161377, | |
| "learning_rate": 3.0237154150197627e-05, | |
| "loss": 1.3184, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 4.046677959721438, | |
| "grad_norm": 5.099429607391357, | |
| "learning_rate": 2.9766610201392815e-05, | |
| "loss": 1.3082, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 4.140786749482402, | |
| "grad_norm": 3.901289939880371, | |
| "learning_rate": 2.9296066252587996e-05, | |
| "loss": 1.3038, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 4.234895539243365, | |
| "grad_norm": 3.7559354305267334, | |
| "learning_rate": 2.8825522303783176e-05, | |
| "loss": 1.2892, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 4.329004329004329, | |
| "grad_norm": 4.156054973602295, | |
| "learning_rate": 2.8354978354978357e-05, | |
| "loss": 1.2893, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 4.423113118765293, | |
| "grad_norm": 3.1980538368225098, | |
| "learning_rate": 2.7884434406173538e-05, | |
| "loss": 1.288, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 4.517221908526256, | |
| "grad_norm": 2.7643442153930664, | |
| "learning_rate": 2.741389045736872e-05, | |
| "loss": 1.2666, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 4.61133069828722, | |
| "grad_norm": 2.195343017578125, | |
| "learning_rate": 2.69433465085639e-05, | |
| "loss": 1.2632, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 4.705439488048183, | |
| "grad_norm": 2.6611695289611816, | |
| "learning_rate": 2.647280255975908e-05, | |
| "loss": 1.262, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 4.799548277809148, | |
| "grad_norm": 5.055428981781006, | |
| "learning_rate": 2.6002258610954265e-05, | |
| "loss": 1.2687, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 4.893657067570111, | |
| "grad_norm": 2.993502616882324, | |
| "learning_rate": 2.5531714662149446e-05, | |
| "loss": 1.2439, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 4.987765857331075, | |
| "grad_norm": 2.6108858585357666, | |
| "learning_rate": 2.5061170713344627e-05, | |
| "loss": 1.2315, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 5.081874647092039, | |
| "grad_norm": 3.422744035720825, | |
| "learning_rate": 2.4590626764539808e-05, | |
| "loss": 1.2216, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 5.175983436853002, | |
| "grad_norm": 2.409943103790283, | |
| "learning_rate": 2.412008281573499e-05, | |
| "loss": 1.214, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 5.270092226613966, | |
| "grad_norm": 2.378814458847046, | |
| "learning_rate": 2.3649538866930173e-05, | |
| "loss": 1.206, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 5.364201016374929, | |
| "grad_norm": 2.6459624767303467, | |
| "learning_rate": 2.3178994918125354e-05, | |
| "loss": 1.2078, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 5.458309806135893, | |
| "grad_norm": 3.1709744930267334, | |
| "learning_rate": 2.2708450969320535e-05, | |
| "loss": 1.1908, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 5.552418595896857, | |
| "grad_norm": 3.47601056098938, | |
| "learning_rate": 2.2237907020515716e-05, | |
| "loss": 1.192, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 5.64652738565782, | |
| "grad_norm": 3.5006909370422363, | |
| "learning_rate": 2.17673630717109e-05, | |
| "loss": 1.1913, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 5.740636175418784, | |
| "grad_norm": 2.8024189472198486, | |
| "learning_rate": 2.129681912290608e-05, | |
| "loss": 1.1815, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 5.834744965179748, | |
| "grad_norm": 4.2450947761535645, | |
| "learning_rate": 2.0826275174101262e-05, | |
| "loss": 1.1793, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 5.928853754940711, | |
| "grad_norm": 3.6110267639160156, | |
| "learning_rate": 2.0355731225296443e-05, | |
| "loss": 1.1734, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 6.0229625447016755, | |
| "grad_norm": 4.730632305145264, | |
| "learning_rate": 1.9885187276491627e-05, | |
| "loss": 1.1447, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 6.117071334462639, | |
| "grad_norm": 3.0380797386169434, | |
| "learning_rate": 1.9414643327686808e-05, | |
| "loss": 1.1206, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 6.211180124223603, | |
| "grad_norm": 4.4165358543396, | |
| "learning_rate": 1.894409937888199e-05, | |
| "loss": 1.1295, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 6.305288913984566, | |
| "grad_norm": 3.3100738525390625, | |
| "learning_rate": 1.847355543007717e-05, | |
| "loss": 1.1214, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 6.39939770374553, | |
| "grad_norm": 4.085879325866699, | |
| "learning_rate": 1.8003011481272354e-05, | |
| "loss": 1.1128, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 6.4935064935064934, | |
| "grad_norm": 3.0867068767547607, | |
| "learning_rate": 1.7532467532467535e-05, | |
| "loss": 1.114, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 6.587615283267457, | |
| "grad_norm": 3.721590757369995, | |
| "learning_rate": 1.7061923583662716e-05, | |
| "loss": 1.1037, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 6.681724073028421, | |
| "grad_norm": 3.4121475219726562, | |
| "learning_rate": 1.6591379634857897e-05, | |
| "loss": 1.107, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 6.775832862789384, | |
| "grad_norm": 3.0844948291778564, | |
| "learning_rate": 1.6120835686053078e-05, | |
| "loss": 1.1005, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 6.869941652550348, | |
| "grad_norm": 3.7161357402801514, | |
| "learning_rate": 1.565029173724826e-05, | |
| "loss": 1.0948, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 6.964050442311311, | |
| "grad_norm": 3.217207193374634, | |
| "learning_rate": 1.5179747788443441e-05, | |
| "loss": 1.0877, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 7.058159232072276, | |
| "grad_norm": 4.4285078048706055, | |
| "learning_rate": 1.4709203839638622e-05, | |
| "loss": 1.0559, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 7.1522680218332395, | |
| "grad_norm": 3.0102875232696533, | |
| "learning_rate": 1.4238659890833805e-05, | |
| "loss": 1.0322, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 7.246376811594203, | |
| "grad_norm": 3.4103572368621826, | |
| "learning_rate": 1.3768115942028985e-05, | |
| "loss": 1.0217, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 7.340485601355167, | |
| "grad_norm": 3.9534976482391357, | |
| "learning_rate": 1.3297571993224166e-05, | |
| "loss": 1.0248, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 7.43459439111613, | |
| "grad_norm": 5.0660719871521, | |
| "learning_rate": 1.2827028044419347e-05, | |
| "loss": 1.0233, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 7.528703180877094, | |
| "grad_norm": 4.05812931060791, | |
| "learning_rate": 1.235648409561453e-05, | |
| "loss": 1.0254, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 7.6228119706380575, | |
| "grad_norm": 2.9366817474365234, | |
| "learning_rate": 1.1885940146809712e-05, | |
| "loss": 1.0154, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 7.716920760399021, | |
| "grad_norm": 3.6969943046569824, | |
| "learning_rate": 1.1415396198004895e-05, | |
| "loss": 1.0023, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 7.811029550159985, | |
| "grad_norm": 3.294569969177246, | |
| "learning_rate": 3.4377940899680034e-05, | |
| "loss": 1.0861, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 7.905138339920948, | |
| "grad_norm": 1.961162805557251, | |
| "learning_rate": 3.418972332015811e-05, | |
| "loss": 1.105, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 7.999247129681912, | |
| "grad_norm": 4.648848056793213, | |
| "learning_rate": 3.400150574063617e-05, | |
| "loss": 1.1171, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 8.093355919442876, | |
| "grad_norm": 4.888842582702637, | |
| "learning_rate": 3.381328816111425e-05, | |
| "loss": 1.0555, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 8.187464709203839, | |
| "grad_norm": 3.3256289958953857, | |
| "learning_rate": 3.362507058159232e-05, | |
| "loss": 1.059, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 8.281573498964804, | |
| "grad_norm": 2.744330406188965, | |
| "learning_rate": 3.3436853002070396e-05, | |
| "loss": 1.0818, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 8.375682288725766, | |
| "grad_norm": 2.7452945709228516, | |
| "learning_rate": 3.3248635422548465e-05, | |
| "loss": 1.0772, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 8.46979107848673, | |
| "grad_norm": 2.993236780166626, | |
| "learning_rate": 3.306041784302654e-05, | |
| "loss": 1.0896, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 8.563899868247695, | |
| "grad_norm": 4.128504276275635, | |
| "learning_rate": 3.287220026350462e-05, | |
| "loss": 1.0806, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 8.658008658008658, | |
| "grad_norm": 3.3763933181762695, | |
| "learning_rate": 3.268398268398268e-05, | |
| "loss": 1.0713, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 8.752117447769622, | |
| "grad_norm": 2.213223457336426, | |
| "learning_rate": 3.249576510446076e-05, | |
| "loss": 1.0698, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 8.846226237530585, | |
| "grad_norm": 3.0853641033172607, | |
| "learning_rate": 3.230754752493883e-05, | |
| "loss": 1.0724, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 8.94033502729155, | |
| "grad_norm": 3.031791925430298, | |
| "learning_rate": 3.2119329945416903e-05, | |
| "loss": 1.0672, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 9.034443817052512, | |
| "grad_norm": 3.0765955448150635, | |
| "learning_rate": 3.193111236589498e-05, | |
| "loss": 1.0545, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 9.128552606813477, | |
| "grad_norm": 4.204183101654053, | |
| "learning_rate": 3.174289478637305e-05, | |
| "loss": 1.0037, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 9.22266139657444, | |
| "grad_norm": 5.794823169708252, | |
| "learning_rate": 3.155467720685112e-05, | |
| "loss": 1.0069, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 9.316770186335404, | |
| "grad_norm": 3.246511459350586, | |
| "learning_rate": 3.136645962732919e-05, | |
| "loss": 1.0151, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 9.410878976096367, | |
| "grad_norm": 2.7034826278686523, | |
| "learning_rate": 3.1178242047807265e-05, | |
| "loss": 1.0183, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 9.504987765857331, | |
| "grad_norm": 2.402880907058716, | |
| "learning_rate": 3.099002446828534e-05, | |
| "loss": 1.0193, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 9.599096555618296, | |
| "grad_norm": 5.0139336585998535, | |
| "learning_rate": 3.080180688876341e-05, | |
| "loss": 1.0145, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 9.693205345379258, | |
| "grad_norm": 3.907541275024414, | |
| "learning_rate": 3.061358930924149e-05, | |
| "loss": 1.011, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 9.787314135140223, | |
| "grad_norm": 4.411949157714844, | |
| "learning_rate": 3.042537172971956e-05, | |
| "loss": 1.0063, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 9.881422924901186, | |
| "grad_norm": 4.035211563110352, | |
| "learning_rate": 3.0237154150197627e-05, | |
| "loss": 1.0149, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 9.97553171466215, | |
| "grad_norm": 3.6050124168395996, | |
| "learning_rate": 3.00489365706757e-05, | |
| "loss": 1.0027, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 10.069640504423113, | |
| "grad_norm": 3.40191388130188, | |
| "learning_rate": 2.9860718991153773e-05, | |
| "loss": 0.9468, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 10.163749294184077, | |
| "grad_norm": 2.8000032901763916, | |
| "learning_rate": 2.9672501411631846e-05, | |
| "loss": 0.9248, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 10.25785808394504, | |
| "grad_norm": 4.177460193634033, | |
| "learning_rate": 2.9484283832109923e-05, | |
| "loss": 0.9389, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 10.351966873706004, | |
| "grad_norm": 3.88246750831604, | |
| "learning_rate": 2.9296066252587996e-05, | |
| "loss": 0.9326, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 10.446075663466967, | |
| "grad_norm": 4.002796173095703, | |
| "learning_rate": 2.910784867306607e-05, | |
| "loss": 0.9378, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 10.540184453227932, | |
| "grad_norm": 4.069864749908447, | |
| "learning_rate": 2.8919631093544135e-05, | |
| "loss": 0.9405, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 10.634293242988894, | |
| "grad_norm": 3.493865966796875, | |
| "learning_rate": 2.8731413514022208e-05, | |
| "loss": 0.9347, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 10.728402032749859, | |
| "grad_norm": 3.725019693374634, | |
| "learning_rate": 2.8543195934500284e-05, | |
| "loss": 0.9428, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 10.822510822510823, | |
| "grad_norm": 3.557591438293457, | |
| "learning_rate": 2.8354978354978357e-05, | |
| "loss": 0.9397, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 10.916619612271786, | |
| "grad_norm": 3.370312213897705, | |
| "learning_rate": 2.816676077545643e-05, | |
| "loss": 0.939, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 11.01072840203275, | |
| "grad_norm": 2.937887191772461, | |
| "learning_rate": 2.7978543195934503e-05, | |
| "loss": 0.9225, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 11.104837191793713, | |
| "grad_norm": 2.403350353240967, | |
| "learning_rate": 2.7790325616412576e-05, | |
| "loss": 0.827, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 11.198945981554678, | |
| "grad_norm": 2.4876935482025146, | |
| "learning_rate": 2.7602108036890646e-05, | |
| "loss": 0.8461, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 11.29305477131564, | |
| "grad_norm": 3.663511276245117, | |
| "learning_rate": 2.741389045736872e-05, | |
| "loss": 0.8434, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 11.387163561076605, | |
| "grad_norm": 4.3568806648254395, | |
| "learning_rate": 2.7225672877846792e-05, | |
| "loss": 0.8496, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 11.481272350837568, | |
| "grad_norm": 3.4234845638275146, | |
| "learning_rate": 2.7037455298324865e-05, | |
| "loss": 0.8549, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 11.575381140598532, | |
| "grad_norm": 2.537666082382202, | |
| "learning_rate": 2.6849237718802938e-05, | |
| "loss": 0.8548, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 11.669489930359495, | |
| "grad_norm": 3.7678942680358887, | |
| "learning_rate": 2.666102013928101e-05, | |
| "loss": 0.8501, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 11.76359872012046, | |
| "grad_norm": 3.4384429454803467, | |
| "learning_rate": 2.647280255975908e-05, | |
| "loss": 0.8546, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 11.857707509881424, | |
| "grad_norm": 2.883970022201538, | |
| "learning_rate": 2.6284584980237154e-05, | |
| "loss": 0.8494, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 11.951816299642386, | |
| "grad_norm": 3.2041871547698975, | |
| "learning_rate": 2.6096367400715227e-05, | |
| "loss": 0.8473, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 12.045925089403351, | |
| "grad_norm": 3.7006897926330566, | |
| "learning_rate": 2.59081498211933e-05, | |
| "loss": 0.7851, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 12.140033879164314, | |
| "grad_norm": 3.4774985313415527, | |
| "learning_rate": 2.5719932241671373e-05, | |
| "loss": 0.7241, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 12.234142668925278, | |
| "grad_norm": 3.615403413772583, | |
| "learning_rate": 2.5531714662149446e-05, | |
| "loss": 0.7296, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 12.32825145868624, | |
| "grad_norm": 4.831261157989502, | |
| "learning_rate": 2.534349708262752e-05, | |
| "loss": 0.7382, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 12.422360248447205, | |
| "grad_norm": 3.072159767150879, | |
| "learning_rate": 2.515527950310559e-05, | |
| "loss": 0.733, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 12.516469038208168, | |
| "grad_norm": 3.442324161529541, | |
| "learning_rate": 2.4967061923583665e-05, | |
| "loss": 0.7453, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 12.610577827969133, | |
| "grad_norm": 2.71148419380188, | |
| "learning_rate": 2.4778844344061735e-05, | |
| "loss": 0.7388, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 12.704686617730095, | |
| "grad_norm": 3.6085212230682373, | |
| "learning_rate": 2.4590626764539808e-05, | |
| "loss": 0.7444, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 12.79879540749106, | |
| "grad_norm": 3.9403274059295654, | |
| "learning_rate": 2.440240918501788e-05, | |
| "loss": 0.7291, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 12.892904197252022, | |
| "grad_norm": 3.322840929031372, | |
| "learning_rate": 2.4214191605495954e-05, | |
| "loss": 0.7523, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 12.987012987012987, | |
| "grad_norm": 4.849690914154053, | |
| "learning_rate": 2.4025974025974027e-05, | |
| "loss": 0.7477, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 13.081121776773951, | |
| "grad_norm": 4.5327959060668945, | |
| "learning_rate": 2.38377564464521e-05, | |
| "loss": 0.5957, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 13.175230566534914, | |
| "grad_norm": 2.9526569843292236, | |
| "learning_rate": 2.3649538866930173e-05, | |
| "loss": 0.5829, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 13.269339356295879, | |
| "grad_norm": 2.7245306968688965, | |
| "learning_rate": 2.3461321287408243e-05, | |
| "loss": 0.5948, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 13.363448146056841, | |
| "grad_norm": 3.7524588108062744, | |
| "learning_rate": 2.327310370788632e-05, | |
| "loss": 0.5956, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 13.457556935817806, | |
| "grad_norm": 4.387008190155029, | |
| "learning_rate": 2.3084886128364392e-05, | |
| "loss": 0.605, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 13.551665725578768, | |
| "grad_norm": 3.449723482131958, | |
| "learning_rate": 2.2896668548842462e-05, | |
| "loss": 0.6087, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 13.645774515339733, | |
| "grad_norm": 3.2176854610443115, | |
| "learning_rate": 2.2708450969320535e-05, | |
| "loss": 0.6141, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 13.739883305100696, | |
| "grad_norm": 4.136612415313721, | |
| "learning_rate": 2.2520233389798608e-05, | |
| "loss": 0.6251, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 13.83399209486166, | |
| "grad_norm": 4.145909786224365, | |
| "learning_rate": 2.233201581027668e-05, | |
| "loss": 0.6094, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 13.928100884622623, | |
| "grad_norm": 3.8660261631011963, | |
| "learning_rate": 2.2143798230754754e-05, | |
| "loss": 0.6194, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 14.022209674383587, | |
| "grad_norm": 3.511117696762085, | |
| "learning_rate": 2.1955580651232827e-05, | |
| "loss": 0.5712, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 14.116318464144552, | |
| "grad_norm": 2.771902084350586, | |
| "learning_rate": 2.17673630717109e-05, | |
| "loss": 0.4559, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 14.210427253905515, | |
| "grad_norm": 5.087037086486816, | |
| "learning_rate": 2.157914549218897e-05, | |
| "loss": 0.4529, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 14.304536043666479, | |
| "grad_norm": 4.842881679534912, | |
| "learning_rate": 2.1390927912667043e-05, | |
| "loss": 0.4567, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 14.398644833427442, | |
| "grad_norm": 3.444424629211426, | |
| "learning_rate": 2.120271033314512e-05, | |
| "loss": 0.4618, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 14.492753623188406, | |
| "grad_norm": 3.934549331665039, | |
| "learning_rate": 2.101449275362319e-05, | |
| "loss": 0.455, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 14.586862412949369, | |
| "grad_norm": 4.338078498840332, | |
| "learning_rate": 2.0826275174101262e-05, | |
| "loss": 0.472, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 14.680971202710333, | |
| "grad_norm": 3.781182050704956, | |
| "learning_rate": 2.0638057594579335e-05, | |
| "loss": 0.4613, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 14.775079992471296, | |
| "grad_norm": 3.866917371749878, | |
| "learning_rate": 2.0449840015057405e-05, | |
| "loss": 0.4791, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 14.86918878223226, | |
| "grad_norm": 3.5265841484069824, | |
| "learning_rate": 2.026162243553548e-05, | |
| "loss": 0.4709, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 14.963297571993223, | |
| "grad_norm": 3.3175408840179443, | |
| "learning_rate": 2.0073404856013554e-05, | |
| "loss": 0.4847, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 15.057406361754188, | |
| "grad_norm": 2.848555564880371, | |
| "learning_rate": 1.9885187276491627e-05, | |
| "loss": 0.3788, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 15.151515151515152, | |
| "grad_norm": 3.355653762817383, | |
| "learning_rate": 1.9696969696969697e-05, | |
| "loss": 0.3235, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 15.245623941276115, | |
| "grad_norm": 3.260960817337036, | |
| "learning_rate": 1.950875211744777e-05, | |
| "loss": 0.3162, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 15.33973273103708, | |
| "grad_norm": 3.9132673740386963, | |
| "learning_rate": 1.9320534537925843e-05, | |
| "loss": 0.3209, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 15.433841520798042, | |
| "grad_norm": 4.693389892578125, | |
| "learning_rate": 1.9132316958403916e-05, | |
| "loss": 0.3421, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 15.527950310559007, | |
| "grad_norm": 3.7237417697906494, | |
| "learning_rate": 1.894409937888199e-05, | |
| "loss": 0.3441, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 15.62205910031997, | |
| "grad_norm": 3.9301464557647705, | |
| "learning_rate": 1.8755881799360062e-05, | |
| "loss": 0.3424, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 15.716167890080934, | |
| "grad_norm": 4.186377048492432, | |
| "learning_rate": 1.8567664219838135e-05, | |
| "loss": 0.3503, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 15.810276679841897, | |
| "grad_norm": 3.6368534564971924, | |
| "learning_rate": 1.8379446640316205e-05, | |
| "loss": 0.3347, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 15.904385469602861, | |
| "grad_norm": 2.6787190437316895, | |
| "learning_rate": 1.8191229060794278e-05, | |
| "loss": 0.3445, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 15.998494259363824, | |
| "grad_norm": 3.942444324493408, | |
| "learning_rate": 1.8003011481272354e-05, | |
| "loss": 0.343, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 16.09260304912479, | |
| "grad_norm": 3.3087995052337646, | |
| "learning_rate": 1.7814793901750424e-05, | |
| "loss": 0.2093, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 16.186711838885753, | |
| "grad_norm": 4.419996738433838, | |
| "learning_rate": 1.7626576322228497e-05, | |
| "loss": 0.2209, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 16.280820628646715, | |
| "grad_norm": 4.352416515350342, | |
| "learning_rate": 1.743835874270657e-05, | |
| "loss": 0.2282, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 16.374929418407678, | |
| "grad_norm": 3.7437074184417725, | |
| "learning_rate": 1.725014116318464e-05, | |
| "loss": 0.232, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 16.469038208168644, | |
| "grad_norm": 6.4077534675598145, | |
| "learning_rate": 1.7061923583662716e-05, | |
| "loss": 0.2291, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 16.563146997929607, | |
| "grad_norm": 3.0419087409973145, | |
| "learning_rate": 1.687370600414079e-05, | |
| "loss": 0.2401, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 16.65725578769057, | |
| "grad_norm": 4.095304012298584, | |
| "learning_rate": 1.6685488424618862e-05, | |
| "loss": 0.233, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 16.751364577451533, | |
| "grad_norm": 3.372295618057251, | |
| "learning_rate": 1.649727084509693e-05, | |
| "loss": 0.2354, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 16.8454733672125, | |
| "grad_norm": 2.918405771255493, | |
| "learning_rate": 1.6309053265575005e-05, | |
| "loss": 0.2421, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 16.93958215697346, | |
| "grad_norm": 2.4548110961914062, | |
| "learning_rate": 1.6120835686053078e-05, | |
| "loss": 0.2374, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 17.033690946734424, | |
| "grad_norm": 4.15058708190918, | |
| "learning_rate": 1.593261810653115e-05, | |
| "loss": 0.2135, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 17.12779973649539, | |
| "grad_norm": 2.979170083999634, | |
| "learning_rate": 1.5744400527009224e-05, | |
| "loss": 0.1495, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 17.221908526256353, | |
| "grad_norm": 3.950493574142456, | |
| "learning_rate": 1.5556182947487297e-05, | |
| "loss": 0.1503, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 17.316017316017316, | |
| "grad_norm": 3.174896001815796, | |
| "learning_rate": 1.5367965367965366e-05, | |
| "loss": 0.1559, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 17.41012610577828, | |
| "grad_norm": 3.4672741889953613, | |
| "learning_rate": 1.5179747788443441e-05, | |
| "loss": 0.1606, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 17.504234895539245, | |
| "grad_norm": 3.950160026550293, | |
| "learning_rate": 1.4991530208921514e-05, | |
| "loss": 0.1561, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 17.598343685300208, | |
| "grad_norm": 3.546109199523926, | |
| "learning_rate": 1.4803312629399587e-05, | |
| "loss": 0.1622, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 17.69245247506117, | |
| "grad_norm": 6.592913627624512, | |
| "learning_rate": 1.4615095049877658e-05, | |
| "loss": 0.1644, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 17.786561264822133, | |
| "grad_norm": 2.762545347213745, | |
| "learning_rate": 1.4426877470355732e-05, | |
| "loss": 0.1614, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 17.8806700545831, | |
| "grad_norm": 2.9720265865325928, | |
| "learning_rate": 1.4238659890833805e-05, | |
| "loss": 0.1577, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 17.974778844344062, | |
| "grad_norm": 3.006880283355713, | |
| "learning_rate": 1.4050442311311876e-05, | |
| "loss": 0.1577, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 18.068887634105025, | |
| "grad_norm": 2.86936616897583, | |
| "learning_rate": 1.3862224731789949e-05, | |
| "loss": 0.1162, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 18.162996423865987, | |
| "grad_norm": 3.608914852142334, | |
| "learning_rate": 1.3674007152268024e-05, | |
| "loss": 0.1001, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 18.257105213626954, | |
| "grad_norm": 3.126116991043091, | |
| "learning_rate": 1.3485789572746097e-05, | |
| "loss": 0.1082, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 18.351214003387916, | |
| "grad_norm": 2.9935250282287598, | |
| "learning_rate": 1.3297571993224166e-05, | |
| "loss": 0.109, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 18.44532279314888, | |
| "grad_norm": 1.9776825904846191, | |
| "learning_rate": 1.3109354413702241e-05, | |
| "loss": 0.1031, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 18.539431582909845, | |
| "grad_norm": 4.182958602905273, | |
| "learning_rate": 1.2921136834180314e-05, | |
| "loss": 0.1084, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 18.633540372670808, | |
| "grad_norm": 3.405510902404785, | |
| "learning_rate": 1.2732919254658385e-05, | |
| "loss": 0.1069, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 18.72764916243177, | |
| "grad_norm": 2.7036936283111572, | |
| "learning_rate": 1.2544701675136458e-05, | |
| "loss": 0.1084, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 18.821757952192733, | |
| "grad_norm": 2.601555347442627, | |
| "learning_rate": 1.235648409561453e-05, | |
| "loss": 0.1097, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 18.9158667419537, | |
| "grad_norm": 2.9937756061553955, | |
| "learning_rate": 1.2168266516092605e-05, | |
| "loss": 0.1004, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 19.009975531714662, | |
| "grad_norm": 1.493790864944458, | |
| "learning_rate": 1.1980048936570676e-05, | |
| "loss": 0.1032, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 19.104084321475625, | |
| "grad_norm": 1.7063292264938354, | |
| "learning_rate": 1.1791831357048749e-05, | |
| "loss": 0.0667, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 19.198193111236588, | |
| "grad_norm": 2.8355624675750732, | |
| "learning_rate": 1.1603613777526822e-05, | |
| "loss": 0.0678, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 19.292301900997554, | |
| "grad_norm": 1.7610359191894531, | |
| "learning_rate": 1.1415396198004895e-05, | |
| "loss": 0.0654, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 19.386410690758517, | |
| "grad_norm": 2.759197950363159, | |
| "learning_rate": 1.1227178618482966e-05, | |
| "loss": 0.0717, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 19.48051948051948, | |
| "grad_norm": 2.657435417175293, | |
| "learning_rate": 1.103896103896104e-05, | |
| "loss": 0.0722, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 19.574628270280446, | |
| "grad_norm": 2.5865299701690674, | |
| "learning_rate": 1.0850743459439112e-05, | |
| "loss": 0.0689, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 19.66873706004141, | |
| "grad_norm": 4.484961986541748, | |
| "learning_rate": 1.0662525879917184e-05, | |
| "loss": 0.0706, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 19.76284584980237, | |
| "grad_norm": 2.431190252304077, | |
| "learning_rate": 1.0474308300395258e-05, | |
| "loss": 0.0691, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 19.856954639563334, | |
| "grad_norm": 2.6577088832855225, | |
| "learning_rate": 1.028609072087333e-05, | |
| "loss": 0.0704, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 19.9510634293243, | |
| "grad_norm": 3.1382012367248535, | |
| "learning_rate": 1.0097873141351403e-05, | |
| "loss": 0.0673, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 20.045172219085263, | |
| "grad_norm": 1.5920716524124146, | |
| "learning_rate": 9.909655561829476e-06, | |
| "loss": 0.0605, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 20.139281008846226, | |
| "grad_norm": 0.49824026226997375, | |
| "learning_rate": 9.721437982307547e-06, | |
| "loss": 0.0427, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 20.23338979860719, | |
| "grad_norm": 1.5648902654647827, | |
| "learning_rate": 9.533220402785622e-06, | |
| "loss": 0.042, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 20.327498588368154, | |
| "grad_norm": 2.084714889526367, | |
| "learning_rate": 9.345002823263693e-06, | |
| "loss": 0.0442, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 20.421607378129117, | |
| "grad_norm": 2.1811583042144775, | |
| "learning_rate": 9.156785243741765e-06, | |
| "loss": 0.0424, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 20.51571616789008, | |
| "grad_norm": 3.9864232540130615, | |
| "learning_rate": 8.96856766421984e-06, | |
| "loss": 0.0439, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 20.609824957651046, | |
| "grad_norm": 4.495816707611084, | |
| "learning_rate": 8.78035008469791e-06, | |
| "loss": 0.0433, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 20.70393374741201, | |
| "grad_norm": 1.6480516195297241, | |
| "learning_rate": 8.592132505175984e-06, | |
| "loss": 0.0454, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 20.79804253717297, | |
| "grad_norm": 1.1842634677886963, | |
| "learning_rate": 8.403914925654057e-06, | |
| "loss": 0.0425, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 20.892151326933934, | |
| "grad_norm": 4.60665225982666, | |
| "learning_rate": 8.215697346132128e-06, | |
| "loss": 0.0419, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 20.9862601166949, | |
| "grad_norm": 3.4669153690338135, | |
| "learning_rate": 8.027479766610201e-06, | |
| "loss": 0.0412, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 21.080368906455863, | |
| "grad_norm": 0.7074203491210938, | |
| "learning_rate": 7.839262187088274e-06, | |
| "loss": 0.0285, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 21.174477696216826, | |
| "grad_norm": 2.58770489692688, | |
| "learning_rate": 7.651044607566347e-06, | |
| "loss": 0.0275, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 21.26858648597779, | |
| "grad_norm": 0.6419113874435425, | |
| "learning_rate": 7.462827028044419e-06, | |
| "loss": 0.0263, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 21.362695275738755, | |
| "grad_norm": 2.158191204071045, | |
| "learning_rate": 7.274609448522493e-06, | |
| "loss": 0.0267, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 21.456804065499718, | |
| "grad_norm": 1.1450761556625366, | |
| "learning_rate": 7.0863918690005655e-06, | |
| "loss": 0.024, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 21.55091285526068, | |
| "grad_norm": 0.9204089045524597, | |
| "learning_rate": 6.898174289478637e-06, | |
| "loss": 0.0273, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 21.645021645021647, | |
| "grad_norm": 1.3897809982299805, | |
| "learning_rate": 6.709956709956711e-06, | |
| "loss": 0.0276, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 21.73913043478261, | |
| "grad_norm": 2.818786382675171, | |
| "learning_rate": 6.521739130434783e-06, | |
| "loss": 0.027, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 21.833239224543572, | |
| "grad_norm": 1.8107503652572632, | |
| "learning_rate": 6.333521550912856e-06, | |
| "loss": 0.025, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 21.927348014304535, | |
| "grad_norm": 2.591801881790161, | |
| "learning_rate": 6.145303971390928e-06, | |
| "loss": 0.0254, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 22.0214568040655, | |
| "grad_norm": 1.22541344165802, | |
| "learning_rate": 5.957086391869001e-06, | |
| "loss": 0.0207, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 22.115565593826464, | |
| "grad_norm": 2.6778624057769775, | |
| "learning_rate": 5.768868812347074e-06, | |
| "loss": 0.0148, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 22.209674383587426, | |
| "grad_norm": 1.2167950868606567, | |
| "learning_rate": 5.5806512328251455e-06, | |
| "loss": 0.0134, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 22.30378317334839, | |
| "grad_norm": 0.1222626119852066, | |
| "learning_rate": 5.3924336533032186e-06, | |
| "loss": 0.0175, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 22.397891963109355, | |
| "grad_norm": 0.3822714686393738, | |
| "learning_rate": 5.204216073781292e-06, | |
| "loss": 0.0156, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 22.492000752870318, | |
| "grad_norm": 0.542395293712616, | |
| "learning_rate": 5.015998494259365e-06, | |
| "loss": 0.0141, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 22.58610954263128, | |
| "grad_norm": 1.051392674446106, | |
| "learning_rate": 4.827780914737437e-06, | |
| "loss": 0.0138, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 22.680218332392247, | |
| "grad_norm": 1.7265046834945679, | |
| "learning_rate": 4.639563335215509e-06, | |
| "loss": 0.0139, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 22.77432712215321, | |
| "grad_norm": 0.8157036304473877, | |
| "learning_rate": 4.451345755693582e-06, | |
| "loss": 0.0147, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 22.868435911914172, | |
| "grad_norm": 2.212116003036499, | |
| "learning_rate": 4.263128176171654e-06, | |
| "loss": 0.0157, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 22.962544701675135, | |
| "grad_norm": 1.5762394666671753, | |
| "learning_rate": 4.074910596649727e-06, | |
| "loss": 0.0139, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 23.0566534914361, | |
| "grad_norm": 0.16070736944675446, | |
| "learning_rate": 3.8866930171278e-06, | |
| "loss": 0.0113, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 23.150762281197064, | |
| "grad_norm": 0.6197986602783203, | |
| "learning_rate": 3.698475437605872e-06, | |
| "loss": 0.0066, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 23.244871070958027, | |
| "grad_norm": 0.2145221084356308, | |
| "learning_rate": 3.510257858083945e-06, | |
| "loss": 0.0061, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 23.33897986071899, | |
| "grad_norm": 0.1779479682445526, | |
| "learning_rate": 3.3220402785620177e-06, | |
| "loss": 0.0059, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 23.433088650479956, | |
| "grad_norm": 0.18615959584712982, | |
| "learning_rate": 3.1338226990400907e-06, | |
| "loss": 0.0057, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 23.52719744024092, | |
| "grad_norm": 1.9545246362686157, | |
| "learning_rate": 2.945605119518163e-06, | |
| "loss": 0.0067, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 23.62130623000188, | |
| "grad_norm": 2.255216598510742, | |
| "learning_rate": 2.7573875399962355e-06, | |
| "loss": 0.0062, | |
| "step": 125500 | |
| }, | |
| { | |
| "epoch": 23.715415019762847, | |
| "grad_norm": 0.28629258275032043, | |
| "learning_rate": 2.5691699604743086e-06, | |
| "loss": 0.0054, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 23.80952380952381, | |
| "grad_norm": 0.1650991588830948, | |
| "learning_rate": 2.3809523809523808e-06, | |
| "loss": 0.0056, | |
| "step": 126500 | |
| }, | |
| { | |
| "epoch": 23.903632599284773, | |
| "grad_norm": 0.45735275745391846, | |
| "learning_rate": 2.192734801430454e-06, | |
| "loss": 0.006, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 23.997741389045736, | |
| "grad_norm": 1.3017574548721313, | |
| "learning_rate": 2.0045172219085264e-06, | |
| "loss": 0.0047, | |
| "step": 127500 | |
| }, | |
| { | |
| "epoch": 24.091850178806702, | |
| "grad_norm": 0.15063992142677307, | |
| "learning_rate": 1.8162996423865988e-06, | |
| "loss": 0.0037, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 24.185958968567665, | |
| "grad_norm": 0.09415856748819351, | |
| "learning_rate": 1.6280820628646716e-06, | |
| "loss": 0.0021, | |
| "step": 128500 | |
| }, | |
| { | |
| "epoch": 24.280067758328627, | |
| "grad_norm": 0.06203685700893402, | |
| "learning_rate": 1.4398644833427442e-06, | |
| "loss": 0.0017, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 24.37417654808959, | |
| "grad_norm": 1.0590204000473022, | |
| "learning_rate": 1.2516469038208169e-06, | |
| "loss": 0.0016, | |
| "step": 129500 | |
| }, | |
| { | |
| "epoch": 24.468285337850556, | |
| "grad_norm": 0.05048515647649765, | |
| "learning_rate": 1.0634293242988897e-06, | |
| "loss": 0.0014, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 24.56239412761152, | |
| "grad_norm": 0.10998225957155228, | |
| "learning_rate": 8.752117447769622e-07, | |
| "loss": 0.0014, | |
| "step": 130500 | |
| }, | |
| { | |
| "epoch": 24.65650291737248, | |
| "grad_norm": 0.03882027417421341, | |
| "learning_rate": 6.869941652550348e-07, | |
| "loss": 0.0013, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 24.750611707133448, | |
| "grad_norm": 0.09357800334692001, | |
| "learning_rate": 4.987765857331075e-07, | |
| "loss": 0.0016, | |
| "step": 131500 | |
| }, | |
| { | |
| "epoch": 24.84472049689441, | |
| "grad_norm": 0.18081815540790558, | |
| "learning_rate": 3.1055900621118013e-07, | |
| "loss": 0.0016, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 24.938829286655373, | |
| "grad_norm": 0.04123268648982048, | |
| "learning_rate": 1.223414266892528e-07, | |
| "loss": 0.0025, | |
| "step": 132500 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "step": 132825, | |
| "total_flos": 2.908084043402707e+18, | |
| "train_loss": 0.022898558901232692, | |
| "train_runtime": 71987.3815, | |
| "train_samples_per_second": 29.519, | |
| "train_steps_per_second": 1.845 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 132825, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 25, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.908084043402707e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |