{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 11814,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012698412698412698,
"grad_norm": 0.18539635837078094,
"learning_rate": 0.00019917047570678858,
"loss": 5.4804,
"step": 50
},
{
"epoch": 0.025396825396825397,
"grad_norm": 0.19653430581092834,
"learning_rate": 0.00019832402234636873,
"loss": 5.4815,
"step": 100
},
{
"epoch": 0.0380952380952381,
"grad_norm": 0.21583063900470734,
"learning_rate": 0.0001974775689859489,
"loss": 5.4752,
"step": 150
},
{
"epoch": 0.050793650793650794,
"grad_norm": 0.659081757068634,
"learning_rate": 0.00019663111562552904,
"loss": 5.3388,
"step": 200
},
{
"epoch": 0.06349206349206349,
"grad_norm": 1.1028743982315063,
"learning_rate": 0.00019578466226510922,
"loss": 5.0174,
"step": 250
},
{
"epoch": 0.0761904761904762,
"grad_norm": 1.1611863374710083,
"learning_rate": 0.00019493820890468936,
"loss": 4.9543,
"step": 300
},
{
"epoch": 0.08888888888888889,
"grad_norm": 1.3357131481170654,
"learning_rate": 0.00019409175554426953,
"loss": 4.8991,
"step": 350
},
{
"epoch": 0.10158730158730159,
"grad_norm": 1.320380687713623,
"learning_rate": 0.00019324530218384968,
"loss": 4.889,
"step": 400
},
{
"epoch": 0.11428571428571428,
"grad_norm": 1.5646241903305054,
"learning_rate": 0.00019239884882342985,
"loss": 4.8641,
"step": 450
},
{
"epoch": 0.12698412698412698,
"grad_norm": 1.2670501470565796,
"learning_rate": 0.00019155239546301,
"loss": 4.8605,
"step": 500
},
{
"epoch": 0.13968253968253969,
"grad_norm": 1.3732168674468994,
"learning_rate": 0.00019070594210259017,
"loss": 4.8206,
"step": 550
},
{
"epoch": 0.1523809523809524,
"grad_norm": 1.7774670124053955,
"learning_rate": 0.0001898594887421703,
"loss": 4.8106,
"step": 600
},
{
"epoch": 0.16507936507936508,
"grad_norm": 1.8740317821502686,
"learning_rate": 0.00018901303538175048,
"loss": 4.8007,
"step": 650
},
{
"epoch": 0.17777777777777778,
"grad_norm": 1.6368571519851685,
"learning_rate": 0.00018816658202133063,
"loss": 4.7815,
"step": 700
},
{
"epoch": 0.19047619047619047,
"grad_norm": 1.5557011365890503,
"learning_rate": 0.0001873201286609108,
"loss": 4.7716,
"step": 750
},
{
"epoch": 0.20317460317460317,
"grad_norm": 1.6413705348968506,
"learning_rate": 0.00018647367530049097,
"loss": 4.767,
"step": 800
},
{
"epoch": 0.21587301587301588,
"grad_norm": 1.6060304641723633,
"learning_rate": 0.00018562722194007112,
"loss": 4.7457,
"step": 850
},
{
"epoch": 0.22857142857142856,
"grad_norm": 1.832118034362793,
"learning_rate": 0.0001847807685796513,
"loss": 4.739,
"step": 900
},
{
"epoch": 0.24126984126984127,
"grad_norm": 1.7070050239562988,
"learning_rate": 0.00018393431521923143,
"loss": 4.7319,
"step": 950
},
{
"epoch": 0.25396825396825395,
"grad_norm": 1.768761157989502,
"learning_rate": 0.0001830878618588116,
"loss": 4.7296,
"step": 1000
},
{
"epoch": 0.26666666666666666,
"grad_norm": 1.7824410200119019,
"learning_rate": 0.00018224140849839175,
"loss": 4.7069,
"step": 1050
},
{
"epoch": 0.27936507936507937,
"grad_norm": 1.8590071201324463,
"learning_rate": 0.00018139495513797192,
"loss": 4.6814,
"step": 1100
},
{
"epoch": 0.2920634920634921,
"grad_norm": 1.8381578922271729,
"learning_rate": 0.00018054850177755206,
"loss": 4.6146,
"step": 1150
},
{
"epoch": 0.3047619047619048,
"grad_norm": 1.8397525548934937,
"learning_rate": 0.00017970204841713224,
"loss": 4.5086,
"step": 1200
},
{
"epoch": 0.31746031746031744,
"grad_norm": 1.8704237937927246,
"learning_rate": 0.00017885559505671238,
"loss": 4.376,
"step": 1250
},
{
"epoch": 0.33015873015873015,
"grad_norm": 2.0715010166168213,
"learning_rate": 0.00017800914169629255,
"loss": 4.2503,
"step": 1300
},
{
"epoch": 0.34285714285714286,
"grad_norm": 3.124469041824341,
"learning_rate": 0.0001771626883358727,
"loss": 4.157,
"step": 1350
},
{
"epoch": 0.35555555555555557,
"grad_norm": 2.0155253410339355,
"learning_rate": 0.00017631623497545287,
"loss": 4.1085,
"step": 1400
},
{
"epoch": 0.3682539682539683,
"grad_norm": 2.309039354324341,
"learning_rate": 0.00017546978161503301,
"loss": 4.043,
"step": 1450
},
{
"epoch": 0.38095238095238093,
"grad_norm": 2.6701300144195557,
"learning_rate": 0.0001746233282546132,
"loss": 4.016,
"step": 1500
},
{
"epoch": 0.39365079365079364,
      "grad_norm": null,
"learning_rate": 0.00017377687489419333,
"loss": 3.9352,
"step": 1550
},
{
"epoch": 0.40634920634920635,
"grad_norm": 2.3252177238464355,
"learning_rate": 0.00017294735060098188,
"loss": 3.9148,
"step": 1600
},
{
"epoch": 0.41904761904761906,
"grad_norm": 2.336137533187866,
"learning_rate": 0.00017210089724056205,
"loss": 3.9097,
"step": 1650
},
{
"epoch": 0.43174603174603177,
"grad_norm": 3.3141894340515137,
"learning_rate": 0.0001712544438801422,
"loss": 3.8494,
"step": 1700
},
{
"epoch": 0.4444444444444444,
"grad_norm": 2.5842506885528564,
"learning_rate": 0.0001704079905197224,
"loss": 3.8192,
"step": 1750
},
{
"epoch": 0.45714285714285713,
"grad_norm": 2.667503833770752,
"learning_rate": 0.00016956153715930254,
"loss": 3.835,
"step": 1800
},
{
"epoch": 0.46984126984126984,
"grad_norm": 2.7759885787963867,
"learning_rate": 0.0001687150837988827,
"loss": 3.7916,
"step": 1850
},
{
"epoch": 0.48253968253968255,
"grad_norm": 2.939075231552124,
"learning_rate": 0.00016786863043846285,
"loss": 3.7805,
"step": 1900
},
{
"epoch": 0.49523809523809526,
"grad_norm": 3.0116183757781982,
"learning_rate": 0.00016702217707804303,
"loss": 3.7436,
"step": 1950
},
{
"epoch": 0.5079365079365079,
"grad_norm": 2.9771008491516113,
"learning_rate": 0.00016617572371762317,
"loss": 3.7206,
"step": 2000
},
{
"epoch": 0.5206349206349207,
"grad_norm": 3.095003843307495,
"learning_rate": 0.00016532927035720334,
"loss": 3.7194,
"step": 2050
},
{
"epoch": 0.5333333333333333,
"grad_norm": 3.0788674354553223,
"learning_rate": 0.0001644828169967835,
"loss": 3.6632,
"step": 2100
},
{
"epoch": 0.546031746031746,
"grad_norm": 3.467590093612671,
"learning_rate": 0.00016363636363636366,
"loss": 3.6919,
"step": 2150
},
{
"epoch": 0.5587301587301587,
"grad_norm": 3.4599721431732178,
"learning_rate": 0.0001627899102759438,
"loss": 3.6657,
"step": 2200
},
{
"epoch": 0.5714285714285714,
"grad_norm": 3.194898843765259,
"learning_rate": 0.00016194345691552398,
"loss": 3.6454,
"step": 2250
},
{
"epoch": 0.5841269841269842,
"grad_norm": 3.5158746242523193,
"learning_rate": 0.00016109700355510412,
"loss": 3.6003,
"step": 2300
},
{
"epoch": 0.5968253968253968,
"grad_norm": 3.2812819480895996,
"learning_rate": 0.0001602505501946843,
"loss": 3.5907,
"step": 2350
},
{
"epoch": 0.6095238095238096,
"grad_norm": 3.5167620182037354,
"learning_rate": 0.00015940409683426444,
"loss": 3.5821,
"step": 2400
},
{
"epoch": 0.6222222222222222,
"grad_norm": 3.5780296325683594,
"learning_rate": 0.0001585576434738446,
"loss": 3.5245,
"step": 2450
},
{
"epoch": 0.6349206349206349,
"grad_norm": 4.31138277053833,
"learning_rate": 0.00015771119011342478,
"loss": 3.5197,
"step": 2500
},
{
"epoch": 0.6476190476190476,
"grad_norm": 3.752105236053467,
"learning_rate": 0.00015686473675300493,
"loss": 3.5104,
"step": 2550
},
{
"epoch": 0.6603174603174603,
"grad_norm": 3.916947603225708,
"learning_rate": 0.0001560182833925851,
"loss": 3.5123,
"step": 2600
},
{
"epoch": 0.6730158730158731,
"grad_norm": 3.912555694580078,
"learning_rate": 0.00015517183003216524,
"loss": 3.4689,
"step": 2650
},
{
"epoch": 0.6857142857142857,
"grad_norm": 4.15084981918335,
"learning_rate": 0.00015432537667174541,
"loss": 3.4572,
"step": 2700
},
{
"epoch": 0.6984126984126984,
"grad_norm": 4.125711441040039,
"learning_rate": 0.00015347892331132556,
"loss": 3.4323,
"step": 2750
},
{
"epoch": 0.7111111111111111,
"grad_norm": 3.8961021900177,
"learning_rate": 0.00015263246995090573,
"loss": 3.4215,
"step": 2800
},
{
"epoch": 0.7238095238095238,
"grad_norm": 4.138737678527832,
"learning_rate": 0.00015178601659048588,
"loss": 3.4113,
"step": 2850
},
{
"epoch": 0.7365079365079366,
"grad_norm": 3.9136457443237305,
"learning_rate": 0.00015093956323006605,
"loss": 3.4036,
"step": 2900
},
{
"epoch": 0.7492063492063492,
"grad_norm": 4.833474636077881,
"learning_rate": 0.0001500931098696462,
"loss": 3.3733,
"step": 2950
},
{
"epoch": 0.7619047619047619,
"grad_norm": 3.974536657333374,
"learning_rate": 0.00014924665650922636,
"loss": 3.3637,
"step": 3000
},
{
"epoch": 0.7746031746031746,
"grad_norm": 3.9956250190734863,
"learning_rate": 0.0001484002031488065,
"loss": 3.3421,
"step": 3050
},
{
"epoch": 0.7873015873015873,
"grad_norm": 4.232203483581543,
"learning_rate": 0.00014755374978838668,
"loss": 3.3564,
"step": 3100
},
{
"epoch": 0.8,
"grad_norm": 4.4057207107543945,
"learning_rate": 0.00014670729642796683,
"loss": 3.3156,
"step": 3150
},
{
"epoch": 0.8126984126984127,
"grad_norm": 4.205805778503418,
"learning_rate": 0.000145860843067547,
"loss": 3.3195,
"step": 3200
},
{
"epoch": 0.8253968253968254,
"grad_norm": 4.056403160095215,
"learning_rate": 0.00014501438970712714,
"loss": 3.3083,
"step": 3250
},
{
"epoch": 0.8380952380952381,
"grad_norm": 4.160456657409668,
"learning_rate": 0.0001441679363467073,
"loss": 3.2684,
"step": 3300
},
{
"epoch": 0.8507936507936508,
"grad_norm": 4.401111602783203,
"learning_rate": 0.00014332148298628746,
"loss": 3.2735,
"step": 3350
},
{
"epoch": 0.8634920634920635,
"grad_norm": 4.488119602203369,
"learning_rate": 0.00014247502962586763,
"loss": 3.2653,
"step": 3400
},
{
"epoch": 0.8761904761904762,
"grad_norm": 4.22524881362915,
"learning_rate": 0.00014162857626544777,
"loss": 3.2396,
"step": 3450
},
{
"epoch": 0.8888888888888888,
"grad_norm": 4.20457124710083,
"learning_rate": 0.00014078212290502795,
"loss": 3.2357,
"step": 3500
},
{
"epoch": 0.9015873015873016,
"grad_norm": 4.248793601989746,
"learning_rate": 0.00013993566954460812,
"loss": 3.2263,
"step": 3550
},
{
"epoch": 0.9142857142857143,
"grad_norm": 5.042767524719238,
"learning_rate": 0.00013908921618418826,
"loss": 3.2255,
"step": 3600
},
{
"epoch": 0.926984126984127,
"grad_norm": 4.23060941696167,
"learning_rate": 0.00013824276282376844,
"loss": 3.1735,
"step": 3650
},
{
"epoch": 0.9396825396825397,
"grad_norm": 4.368992805480957,
"learning_rate": 0.00013739630946334858,
"loss": 3.1896,
"step": 3700
},
{
"epoch": 0.9523809523809523,
"grad_norm": 4.162256717681885,
"learning_rate": 0.00013654985610292875,
"loss": 3.1839,
"step": 3750
},
{
"epoch": 0.9650793650793651,
"grad_norm": 4.40270471572876,
"learning_rate": 0.0001357034027425089,
"loss": 3.1565,
"step": 3800
},
{
"epoch": 0.9777777777777777,
"grad_norm": 4.529857635498047,
"learning_rate": 0.00013485694938208907,
"loss": 3.1492,
"step": 3850
},
{
"epoch": 0.9904761904761905,
"grad_norm": 5.210158824920654,
"learning_rate": 0.00013402742508887761,
"loss": 3.1567,
"step": 3900
},
{
"epoch": 1.003047619047619,
"grad_norm": 5.2821478843688965,
"learning_rate": 0.00013318097172845776,
"loss": 3.1272,
"step": 3950
},
{
"epoch": 1.0157460317460318,
"grad_norm": 4.609035015106201,
"learning_rate": 0.00013233451836803793,
"loss": 3.1204,
"step": 4000
},
{
"epoch": 1.0284444444444445,
"grad_norm": 5.00595235824585,
"learning_rate": 0.00013148806500761808,
"loss": 3.0907,
"step": 4050
},
{
"epoch": 1.0411428571428571,
"grad_norm": 4.449355125427246,
"learning_rate": 0.00013064161164719825,
"loss": 3.1063,
"step": 4100
},
{
"epoch": 1.0538412698412698,
"grad_norm": 4.445562839508057,
"learning_rate": 0.0001297951582867784,
"loss": 3.097,
"step": 4150
},
{
"epoch": 1.0665396825396825,
"grad_norm": 5.181248664855957,
"learning_rate": 0.00012894870492635856,
"loss": 3.0693,
"step": 4200
},
{
"epoch": 1.0792380952380953,
"grad_norm": 4.498983383178711,
"learning_rate": 0.0001281022515659387,
"loss": 3.072,
"step": 4250
},
{
"epoch": 1.091936507936508,
"grad_norm": 4.873691082000732,
"learning_rate": 0.00012725579820551888,
"loss": 3.0545,
"step": 4300
},
{
"epoch": 1.1046349206349206,
"grad_norm": 4.709224700927734,
"learning_rate": 0.00012640934484509903,
"loss": 3.0424,
"step": 4350
},
{
"epoch": 1.1173333333333333,
"grad_norm": 5.21176815032959,
"learning_rate": 0.0001255628914846792,
"loss": 3.0412,
"step": 4400
},
{
"epoch": 1.130031746031746,
"grad_norm": 4.796786785125732,
"learning_rate": 0.00012471643812425934,
"loss": 3.0254,
"step": 4450
},
{
"epoch": 1.1427301587301588,
"grad_norm": 5.506641864776611,
"learning_rate": 0.00012386998476383951,
"loss": 3.0268,
"step": 4500
},
{
"epoch": 1.1554285714285715,
"grad_norm": 4.79102897644043,
"learning_rate": 0.00012302353140341966,
"loss": 2.9979,
"step": 4550
},
{
"epoch": 1.1681269841269841,
"grad_norm": 5.435400009155273,
"learning_rate": 0.00012217707804299983,
"loss": 2.987,
"step": 4600
},
{
"epoch": 1.1808253968253968,
"grad_norm": 4.705477714538574,
"learning_rate": 0.00012133062468257999,
"loss": 2.9915,
"step": 4650
},
{
"epoch": 1.1935238095238094,
"grad_norm": 4.822847843170166,
"learning_rate": 0.00012048417132216015,
"loss": 2.9762,
"step": 4700
},
{
"epoch": 1.2062222222222223,
"grad_norm": 4.770782947540283,
"learning_rate": 0.0001196377179617403,
"loss": 2.9744,
"step": 4750
},
{
"epoch": 1.218920634920635,
"grad_norm": 5.113085746765137,
"learning_rate": 0.00011879126460132046,
"loss": 2.9761,
"step": 4800
},
{
"epoch": 1.2316190476190476,
"grad_norm": 6.109035491943359,
"learning_rate": 0.00011796174030810902,
"loss": 2.934,
"step": 4850
},
{
"epoch": 1.2443174603174603,
"grad_norm": 5.884860038757324,
"learning_rate": 0.00011711528694768918,
"loss": 2.9369,
"step": 4900
},
{
"epoch": 1.257015873015873,
"grad_norm": 7.224523544311523,
"learning_rate": 0.00011626883358726934,
"loss": 2.9209,
"step": 4950
},
{
"epoch": 1.2697142857142858,
"grad_norm": 5.234792232513428,
"learning_rate": 0.0001154223802268495,
"loss": 2.9331,
"step": 5000
},
{
"epoch": 1.2824126984126984,
"grad_norm": 4.842894554138184,
"learning_rate": 0.00011457592686642966,
"loss": 2.8986,
"step": 5050
},
{
"epoch": 1.295111111111111,
"grad_norm": 4.660989284515381,
"learning_rate": 0.00011372947350600981,
"loss": 2.9141,
"step": 5100
},
{
"epoch": 1.3078095238095238,
"grad_norm": 5.343238830566406,
"learning_rate": 0.00011288302014558999,
"loss": 2.8895,
"step": 5150
},
{
"epoch": 1.3205079365079366,
"grad_norm": 5.187355041503906,
"learning_rate": 0.00011203656678517014,
"loss": 2.892,
"step": 5200
},
{
"epoch": 1.3332063492063493,
"grad_norm": 4.856098175048828,
"learning_rate": 0.0001111901134247503,
"loss": 2.8736,
"step": 5250
},
{
"epoch": 1.345904761904762,
"grad_norm": 4.733485698699951,
"learning_rate": 0.00011034366006433046,
"loss": 2.8567,
"step": 5300
},
{
"epoch": 1.3586031746031746,
"grad_norm": 4.625833034515381,
"learning_rate": 0.00010949720670391062,
"loss": 2.8737,
"step": 5350
},
{
"epoch": 1.3713015873015872,
"grad_norm": 4.856983184814453,
"learning_rate": 0.00010865075334349078,
"loss": 2.8417,
"step": 5400
},
{
"epoch": 1.384,
"grad_norm": 5.2909932136535645,
"learning_rate": 0.00010780429998307094,
"loss": 2.8346,
"step": 5450
},
{
"epoch": 1.3966984126984128,
"grad_norm": 4.909002780914307,
"learning_rate": 0.0001069578466226511,
"loss": 2.8307,
"step": 5500
},
{
"epoch": 1.4093968253968254,
"grad_norm": 4.69639778137207,
"learning_rate": 0.00010611139326223125,
"loss": 2.8138,
"step": 5550
},
{
"epoch": 1.422095238095238,
"grad_norm": 4.822878837585449,
"learning_rate": 0.00010526493990181141,
"loss": 2.8071,
"step": 5600
},
{
"epoch": 1.4347936507936507,
"grad_norm": 5.602210998535156,
"learning_rate": 0.00010441848654139157,
"loss": 2.8128,
"step": 5650
},
{
"epoch": 1.4474920634920636,
"grad_norm": 4.855912208557129,
"learning_rate": 0.00010357203318097173,
"loss": 2.8066,
"step": 5700
},
{
"epoch": 1.4601904761904763,
"grad_norm": 5.553136348724365,
"learning_rate": 0.00010272557982055189,
"loss": 2.7876,
"step": 5750
},
{
"epoch": 1.472888888888889,
"grad_norm": 4.901633262634277,
"learning_rate": 0.00010187912646013204,
"loss": 2.7787,
"step": 5800
},
{
"epoch": 1.4855873015873016,
"grad_norm": 6.1740217208862305,
"learning_rate": 0.0001010326730997122,
"loss": 2.7704,
"step": 5850
},
{
"epoch": 1.4982857142857142,
"grad_norm": 5.393040180206299,
"learning_rate": 0.00010018621973929236,
"loss": 2.7753,
"step": 5900
},
{
"epoch": 1.5109841269841269,
"grad_norm": 6.0959930419921875,
"learning_rate": 9.933976637887253e-05,
"loss": 2.7538,
"step": 5950
},
{
"epoch": 1.5236825396825395,
"grad_norm": 4.659241199493408,
"learning_rate": 9.849331301845269e-05,
"loss": 2.7517,
"step": 6000
},
{
"epoch": 1.5363809523809524,
"grad_norm": 5.5795087814331055,
"learning_rate": 9.766378872524125e-05,
"loss": 2.7409,
"step": 6050
},
{
"epoch": 1.549079365079365,
"grad_norm": 4.83104944229126,
"learning_rate": 9.681733536482141e-05,
"loss": 2.7358,
"step": 6100
},
{
"epoch": 1.561777777777778,
"grad_norm": 5.035250663757324,
"learning_rate": 9.597088200440157e-05,
"loss": 2.7236,
"step": 6150
},
{
"epoch": 1.5744761904761906,
"grad_norm": 5.167687892913818,
"learning_rate": 9.512442864398172e-05,
"loss": 2.7232,
"step": 6200
},
{
"epoch": 1.5871746031746032,
"grad_norm": 5.0377326011657715,
"learning_rate": 9.427797528356188e-05,
"loss": 2.7368,
"step": 6250
},
{
"epoch": 1.599873015873016,
"grad_norm": 4.893152713775635,
"learning_rate": 9.343152192314204e-05,
"loss": 2.6973,
"step": 6300
},
{
"epoch": 1.6125714285714285,
"grad_norm": 5.246462345123291,
"learning_rate": 9.25850685627222e-05,
"loss": 2.6858,
"step": 6350
},
{
"epoch": 1.6252698412698412,
"grad_norm": 5.26235294342041,
"learning_rate": 9.173861520230236e-05,
"loss": 2.6816,
"step": 6400
},
{
"epoch": 1.6379682539682539,
"grad_norm": 4.8995513916015625,
"learning_rate": 9.089216184188252e-05,
"loss": 2.6865,
"step": 6450
},
{
"epoch": 1.6506666666666665,
"grad_norm": 5.598567962646484,
"learning_rate": 9.004570848146267e-05,
"loss": 2.6675,
"step": 6500
},
{
"epoch": 1.6633650793650794,
"grad_norm": 5.423081874847412,
"learning_rate": 8.919925512104283e-05,
"loss": 2.66,
"step": 6550
},
{
"epoch": 1.676063492063492,
"grad_norm": 4.968945026397705,
"learning_rate": 8.835280176062299e-05,
"loss": 2.6629,
"step": 6600
},
{
"epoch": 1.688761904761905,
"grad_norm": 6.054278373718262,
"learning_rate": 8.750634840020315e-05,
"loss": 2.6784,
"step": 6650
},
{
"epoch": 1.7014603174603176,
"grad_norm": 5.279598712921143,
"learning_rate": 8.665989503978331e-05,
"loss": 2.6327,
"step": 6700
},
{
"epoch": 1.7141587301587302,
"grad_norm": 5.150700092315674,
"learning_rate": 8.581344167936347e-05,
"loss": 2.6394,
"step": 6750
},
{
"epoch": 1.7268571428571429,
"grad_norm": 5.459251403808594,
"learning_rate": 8.496698831894362e-05,
"loss": 2.644,
"step": 6800
},
{
"epoch": 1.7395555555555555,
"grad_norm": 5.293938159942627,
"learning_rate": 8.413746402573218e-05,
"loss": 2.646,
"step": 6850
},
{
"epoch": 1.7522539682539682,
"grad_norm": 5.72529411315918,
"learning_rate": 8.329101066531234e-05,
"loss": 2.624,
"step": 6900
},
{
"epoch": 1.7649523809523808,
"grad_norm": 5.739988327026367,
"learning_rate": 8.24445573048925e-05,
"loss": 2.6175,
"step": 6950
},
{
"epoch": 1.7776507936507937,
"grad_norm": 5.638957500457764,
"learning_rate": 8.159810394447266e-05,
"loss": 2.6236,
"step": 7000
},
{
"epoch": 1.7903492063492064,
"grad_norm": 5.7885026931762695,
"learning_rate": 8.075165058405282e-05,
"loss": 2.5973,
"step": 7050
},
{
"epoch": 1.803047619047619,
"grad_norm": 5.244924545288086,
"learning_rate": 7.990519722363298e-05,
"loss": 2.6031,
"step": 7100
},
{
"epoch": 1.8157460317460319,
"grad_norm": 9.06462287902832,
"learning_rate": 7.905874386321313e-05,
"loss": 2.5768,
"step": 7150
},
{
"epoch": 1.8284444444444445,
"grad_norm": 5.335842132568359,
"learning_rate": 7.821229050279329e-05,
"loss": 2.581,
"step": 7200
},
{
"epoch": 1.8411428571428572,
"grad_norm": 5.182567596435547,
"learning_rate": 7.736583714237345e-05,
"loss": 2.5754,
"step": 7250
},
{
"epoch": 1.8538412698412698,
"grad_norm": 5.487778186798096,
"learning_rate": 7.651938378195361e-05,
"loss": 2.5688,
"step": 7300
},
{
"epoch": 1.8665396825396825,
"grad_norm": 5.46382474899292,
"learning_rate": 7.567293042153377e-05,
"loss": 2.5632,
"step": 7350
},
{
"epoch": 1.8792380952380952,
"grad_norm": 5.17083740234375,
"learning_rate": 7.482647706111392e-05,
"loss": 2.55,
"step": 7400
},
{
"epoch": 1.8919365079365078,
"grad_norm": 5.455732345581055,
"learning_rate": 7.398002370069408e-05,
"loss": 2.5509,
"step": 7450
},
{
"epoch": 1.9046349206349207,
"grad_norm": 5.0072503089904785,
"learning_rate": 7.313357034027426e-05,
"loss": 2.5436,
"step": 7500
},
{
"epoch": 1.9173333333333333,
"grad_norm": 5.0145087242126465,
"learning_rate": 7.228711697985441e-05,
"loss": 2.5416,
"step": 7550
},
{
"epoch": 1.930031746031746,
"grad_norm": 5.2530364990234375,
"learning_rate": 7.144066361943457e-05,
"loss": 2.5234,
"step": 7600
},
{
"epoch": 1.9427301587301589,
"grad_norm": 4.886019229888916,
"learning_rate": 7.059421025901474e-05,
"loss": 2.5447,
"step": 7650
},
{
"epoch": 1.9554285714285715,
"grad_norm": 5.070368766784668,
"learning_rate": 6.97477568985949e-05,
"loss": 2.5308,
"step": 7700
},
{
"epoch": 1.9681269841269842,
"grad_norm": 5.158459186553955,
"learning_rate": 6.890130353817506e-05,
"loss": 2.5265,
"step": 7750
},
{
"epoch": 1.9808253968253968,
"grad_norm": 5.249716281890869,
"learning_rate": 6.805485017775522e-05,
"loss": 2.5079,
"step": 7800
},
{
"epoch": 1.9935238095238095,
"grad_norm": 5.3184590339660645,
"learning_rate": 6.720839681733538e-05,
"loss": 2.5135,
"step": 7850
},
{
"epoch": 2.006095238095238,
"grad_norm": 4.898834705352783,
"learning_rate": 6.636194345691553e-05,
"loss": 2.487,
"step": 7900
},
{
"epoch": 2.0187936507936506,
"grad_norm": 4.718045234680176,
"learning_rate": 6.55154900964957e-05,
"loss": 2.504,
"step": 7950
},
{
"epoch": 2.0314920634920637,
"grad_norm": 4.977246284484863,
"learning_rate": 6.466903673607585e-05,
"loss": 2.4924,
"step": 8000
},
{
"epoch": 2.0441904761904763,
"grad_norm": 5.398455619812012,
"learning_rate": 6.382258337565601e-05,
"loss": 2.4839,
"step": 8050
},
{
"epoch": 2.056888888888889,
"grad_norm": 6.387637138366699,
"learning_rate": 6.297613001523617e-05,
"loss": 2.4833,
"step": 8100
},
{
"epoch": 2.0695873015873016,
"grad_norm": 5.588785648345947,
"learning_rate": 6.212967665481633e-05,
"loss": 2.4569,
"step": 8150
},
{
"epoch": 2.0822857142857143,
"grad_norm": 6.301563262939453,
"learning_rate": 6.128322329439648e-05,
"loss": 2.4748,
"step": 8200
},
{
"epoch": 2.094984126984127,
"grad_norm": 5.7610979080200195,
"learning_rate": 6.043676993397664e-05,
"loss": 2.4597,
"step": 8250
},
{
"epoch": 2.1076825396825396,
"grad_norm": 5.260268211364746,
"learning_rate": 5.95903165735568e-05,
"loss": 2.4581,
"step": 8300
},
{
"epoch": 2.1203809523809523,
"grad_norm": 5.712375640869141,
"learning_rate": 5.874386321313696e-05,
"loss": 2.4522,
"step": 8350
},
{
"epoch": 2.133079365079365,
"grad_norm": 6.139365196228027,
"learning_rate": 5.7897409852717125e-05,
"loss": 2.4588,
"step": 8400
},
{
"epoch": 2.145777777777778,
"grad_norm": 5.294638633728027,
"learning_rate": 5.705095649229728e-05,
"loss": 2.4467,
"step": 8450
},
{
"epoch": 2.1584761904761907,
"grad_norm": 6.786096572875977,
"learning_rate": 5.620450313187744e-05,
"loss": 2.4538,
"step": 8500
},
{
"epoch": 2.1711746031746033,
"grad_norm": 4.95149564743042,
"learning_rate": 5.53580497714576e-05,
"loss": 2.4478,
"step": 8550
},
{
"epoch": 2.183873015873016,
"grad_norm": 4.965189456939697,
"learning_rate": 5.451159641103776e-05,
"loss": 2.4449,
"step": 8600
},
{
"epoch": 2.1965714285714286,
"grad_norm": 5.404216766357422,
"learning_rate": 5.3665143050617916e-05,
"loss": 2.4447,
"step": 8650
},
{
"epoch": 2.2092698412698413,
"grad_norm": 5.965044021606445,
"learning_rate": 5.2818689690198075e-05,
"loss": 2.4337,
"step": 8700
},
{
"epoch": 2.221968253968254,
"grad_norm": 6.933143615722656,
"learning_rate": 5.197223632977823e-05,
"loss": 2.4105,
"step": 8750
},
{
"epoch": 2.2346666666666666,
"grad_norm": 4.795731067657471,
"learning_rate": 5.112578296935839e-05,
"loss": 2.4192,
"step": 8800
},
{
"epoch": 2.2473650793650792,
"grad_norm": 5.221839427947998,
"learning_rate": 5.027932960893855e-05,
"loss": 2.4232,
"step": 8850
},
{
"epoch": 2.260063492063492,
"grad_norm": 5.512608528137207,
"learning_rate": 4.9432876248518715e-05,
"loss": 2.414,
"step": 8900
},
{
"epoch": 2.2727619047619045,
"grad_norm": 5.804533958435059,
"learning_rate": 4.858642288809887e-05,
"loss": 2.4088,
"step": 8950
},
{
"epoch": 2.2854603174603176,
"grad_norm": 5.301205635070801,
"learning_rate": 4.773996952767903e-05,
"loss": 2.4074,
"step": 9000
},
{
"epoch": 2.2981587301587303,
"grad_norm": 5.4529290199279785,
"learning_rate": 4.689351616725919e-05,
"loss": 2.3906,
"step": 9050
},
{
"epoch": 2.310857142857143,
"grad_norm": 5.1715006828308105,
"learning_rate": 4.604706280683935e-05,
"loss": 2.4082,
"step": 9100
},
{
"epoch": 2.3235555555555556,
"grad_norm": 5.739888668060303,
"learning_rate": 4.5200609446419506e-05,
"loss": 2.3982,
"step": 9150
},
{
"epoch": 2.3362539682539682,
"grad_norm": 5.454600811004639,
"learning_rate": 4.4354156085999664e-05,
"loss": 2.3947,
"step": 9200
},
{
"epoch": 2.348952380952381,
"grad_norm": 5.671194553375244,
"learning_rate": 4.350770272557982e-05,
"loss": 2.3984,
"step": 9250
},
{
"epoch": 2.3616507936507936,
"grad_norm": 5.695377349853516,
"learning_rate": 4.266124936515998e-05,
"loss": 2.3732,
"step": 9300
},
{
"epoch": 2.374349206349206,
"grad_norm": 5.454712390899658,
"learning_rate": 4.181479600474014e-05,
"loss": 2.3769,
"step": 9350
},
{
"epoch": 2.387047619047619,
"grad_norm": 5.6516499519348145,
"learning_rate": 4.0968342644320304e-05,
"loss": 2.3788,
"step": 9400
},
{
"epoch": 2.399746031746032,
"grad_norm": 5.617581367492676,
"learning_rate": 4.012188928390046e-05,
"loss": 2.3754,
"step": 9450
},
{
"epoch": 2.4124444444444446,
"grad_norm": 5.995534420013428,
"learning_rate": 3.927543592348062e-05,
"loss": 2.3647,
"step": 9500
},
{
"epoch": 2.4251428571428573,
"grad_norm": 4.861730098724365,
"learning_rate": 3.842898256306078e-05,
"loss": 2.3699,
"step": 9550
},
{
"epoch": 2.43784126984127,
"grad_norm": 5.109068393707275,
"learning_rate": 3.758252920264094e-05,
"loss": 2.3709,
"step": 9600
},
{
"epoch": 2.4505396825396826,
"grad_norm": 5.989678382873535,
"learning_rate": 3.6736075842221096e-05,
"loss": 2.3442,
"step": 9650
},
{
"epoch": 2.4632380952380952,
"grad_norm": 5.233463764190674,
"learning_rate": 3.5889622481801254e-05,
"loss": 2.3567,
"step": 9700
},
{
"epoch": 2.475936507936508,
"grad_norm": 4.930139541625977,
"learning_rate": 3.504316912138141e-05,
"loss": 2.3334,
"step": 9750
},
{
"epoch": 2.4886349206349205,
"grad_norm": 5.235612392425537,
"learning_rate": 3.419671576096157e-05,
"loss": 2.3523,
"step": 9800
},
{
"epoch": 2.501333333333333,
"grad_norm": 7.875730991363525,
"learning_rate": 3.335026240054173e-05,
"loss": 2.3508,
"step": 9850
},
{
"epoch": 2.514031746031746,
"grad_norm": 5.479938507080078,
"learning_rate": 3.2503809040121894e-05,
"loss": 2.3579,
"step": 9900
},
{
"epoch": 2.5267301587301585,
"grad_norm": 4.948204040527344,
"learning_rate": 3.165735567970205e-05,
"loss": 2.3475,
"step": 9950
},
{
"epoch": 2.5394285714285716,
"grad_norm": 5.541229724884033,
"learning_rate": 3.081090231928221e-05,
"loss": 2.3276,
"step": 10000
},
{
"epoch": 2.5521269841269842,
"grad_norm": 5.418817043304443,
"learning_rate": 2.996444895886237e-05,
"loss": 2.3433,
"step": 10050
},
{
"epoch": 2.564825396825397,
"grad_norm": 7.228455543518066,
"learning_rate": 2.9117995598442527e-05,
"loss": 2.3274,
"step": 10100
},
{
"epoch": 2.5775238095238096,
"grad_norm": 5.323376655578613,
"learning_rate": 2.8271542238022686e-05,
"loss": 2.3404,
"step": 10150
},
{
"epoch": 2.590222222222222,
"grad_norm": 5.080998420715332,
"learning_rate": 2.7425088877602844e-05,
"loss": 2.3164,
"step": 10200
},
{
"epoch": 2.602920634920635,
"grad_norm": 5.0400285720825195,
"learning_rate": 2.6578635517183002e-05,
"loss": 2.3301,
"step": 10250
},
{
"epoch": 2.6156190476190475,
"grad_norm": 5.519168853759766,
"learning_rate": 2.5732182156763164e-05,
"loss": 2.3206,
"step": 10300
},
{
"epoch": 2.62831746031746,
"grad_norm": 5.184562683105469,
"learning_rate": 2.4885728796343322e-05,
"loss": 2.3197,
"step": 10350
},
{
"epoch": 2.6410158730158733,
"grad_norm": 5.173785209655762,
"learning_rate": 2.403927543592348e-05,
"loss": 2.3176,
"step": 10400
},
{
"epoch": 2.653714285714286,
"grad_norm": 5.67647647857666,
"learning_rate": 2.319282207550364e-05,
"loss": 2.3154,
"step": 10450
},
{
"epoch": 2.6664126984126986,
"grad_norm": 6.398087978363037,
"learning_rate": 2.2346368715083797e-05,
"loss": 2.3164,
"step": 10500
},
{
"epoch": 2.679111111111111,
"grad_norm": 5.975333213806152,
"learning_rate": 2.149991535466396e-05,
"loss": 2.314,
"step": 10550
},
{
"epoch": 2.691809523809524,
"grad_norm": 5.434169292449951,
"learning_rate": 2.065346199424412e-05,
"loss": 2.3034,
"step": 10600
},
{
"epoch": 2.7045079365079365,
"grad_norm": 5.366811275482178,
"learning_rate": 1.980700863382428e-05,
"loss": 2.3161,
"step": 10650
},
{
"epoch": 2.717206349206349,
"grad_norm": 6.124394416809082,
"learning_rate": 1.8960555273404437e-05,
"loss": 2.3031,
"step": 10700
},
{
"epoch": 2.729904761904762,
"grad_norm": 6.769460201263428,
"learning_rate": 1.8114101912984595e-05,
"loss": 2.2893,
"step": 10750
},
{
"epoch": 2.7426031746031745,
"grad_norm": 4.68062162399292,
"learning_rate": 1.7267648552564754e-05,
"loss": 2.3194,
"step": 10800
},
{
"epoch": 2.755301587301587,
"grad_norm": 5.621355056762695,
"learning_rate": 1.6421195192144915e-05,
"loss": 2.3095,
"step": 10850
},
{
"epoch": 2.768,
"grad_norm": 5.693627834320068,
"learning_rate": 1.5574741831725074e-05,
"loss": 2.3029,
"step": 10900
},
{
"epoch": 2.7806984126984124,
"grad_norm": 5.223197937011719,
"learning_rate": 1.4728288471305232e-05,
"loss": 2.2965,
"step": 10950
},
{
"epoch": 2.7933968253968255,
"grad_norm": 5.495180606842041,
"learning_rate": 1.388183511088539e-05,
"loss": 2.3037,
"step": 11000
},
{
"epoch": 2.806095238095238,
"grad_norm": 5.025885105133057,
"learning_rate": 1.303538175046555e-05,
"loss": 2.2896,
"step": 11050
},
{
"epoch": 2.818793650793651,
"grad_norm": 5.007611274719238,
"learning_rate": 1.2188928390045709e-05,
"loss": 2.2878,
"step": 11100
},
{
"epoch": 2.8314920634920635,
"grad_norm": 4.786118984222412,
"learning_rate": 1.1342475029625869e-05,
"loss": 2.2823,
"step": 11150
},
{
"epoch": 2.844190476190476,
"grad_norm": 5.517834663391113,
"learning_rate": 1.0496021669206027e-05,
"loss": 2.2796,
"step": 11200
},
{
"epoch": 2.856888888888889,
"grad_norm": 7.0708818435668945,
"learning_rate": 9.649568308786185e-06,
"loss": 2.2883,
"step": 11250
},
{
"epoch": 2.8695873015873015,
"grad_norm": 5.164283275604248,
"learning_rate": 8.803114948366345e-06,
"loss": 2.2921,
"step": 11300
},
{
"epoch": 2.8822857142857146,
"grad_norm": 5.082614421844482,
"learning_rate": 7.956661587946503e-06,
"loss": 2.2956,
"step": 11350
},
{
"epoch": 2.894984126984127,
"grad_norm": 5.358335018157959,
"learning_rate": 7.110208227526663e-06,
"loss": 2.29,
"step": 11400
},
{
"epoch": 2.90768253968254,
"grad_norm": 4.937663555145264,
"learning_rate": 6.2637548671068235e-06,
"loss": 2.2912,
"step": 11450
},
{
"epoch": 2.9203809523809525,
"grad_norm": 4.954619407653809,
"learning_rate": 5.417301506686982e-06,
"loss": 2.2946,
"step": 11500
},
{
"epoch": 2.933079365079365,
"grad_norm": 5.406091690063477,
"learning_rate": 4.570848146267141e-06,
"loss": 2.2903,
"step": 11550
},
{
"epoch": 2.945777777777778,
"grad_norm": 5.957233428955078,
"learning_rate": 3.7243947858473e-06,
"loss": 2.2808,
"step": 11600
},
{
"epoch": 2.9584761904761905,
"grad_norm": 5.3814215660095215,
"learning_rate": 2.8779414254274592e-06,
"loss": 2.2918,
"step": 11650
},
{
"epoch": 2.971174603174603,
"grad_norm": 7.456835746765137,
"learning_rate": 2.0314880650076184e-06,
"loss": 2.2871,
"step": 11700
},
{
"epoch": 2.983873015873016,
"grad_norm": 5.577419757843018,
"learning_rate": 1.1850347045877773e-06,
"loss": 2.2791,
"step": 11750
},
{
"epoch": 2.9965714285714284,
"grad_norm": 5.656188488006592,
"learning_rate": 3.3858134416793636e-07,
"loss": 2.2811,
"step": 11800
},
{
"epoch": 3.0,
"step": 11814,
"total_flos": 5952688007980032.0,
"train_loss": 3.0341684326254534,
"train_runtime": 5229.0251,
"train_samples_per_second": 72.289,
"train_steps_per_second": 2.259
}
],
"logging_steps": 50,
"max_steps": 11814,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5952688007980032.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}