{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 11814,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012698412698412698,
"grad_norm": 0.18539635837078094,
"learning_rate": 0.00019917047570678858,
"loss": 5.4804,
"step": 50
},
{
"epoch": 0.025396825396825397,
"grad_norm": 0.19653430581092834,
"learning_rate": 0.00019832402234636873,
"loss": 5.4815,
"step": 100
},
{
"epoch": 0.0380952380952381,
"grad_norm": 0.21583063900470734,
"learning_rate": 0.0001974775689859489,
"loss": 5.4752,
"step": 150
},
{
"epoch": 0.050793650793650794,
"grad_norm": 0.659081757068634,
"learning_rate": 0.00019663111562552904,
"loss": 5.3388,
"step": 200
},
{
"epoch": 0.06349206349206349,
"grad_norm": 1.1028743982315063,
"learning_rate": 0.00019578466226510922,
"loss": 5.0174,
"step": 250
},
{
"epoch": 0.0761904761904762,
"grad_norm": 1.1611863374710083,
"learning_rate": 0.00019493820890468936,
"loss": 4.9543,
"step": 300
},
{
"epoch": 0.08888888888888889,
"grad_norm": 1.3357131481170654,
"learning_rate": 0.00019409175554426953,
"loss": 4.8991,
"step": 350
},
{
"epoch": 0.10158730158730159,
"grad_norm": 1.320380687713623,
"learning_rate": 0.00019324530218384968,
"loss": 4.889,
"step": 400
},
{
"epoch": 0.11428571428571428,
"grad_norm": 1.5646241903305054,
"learning_rate": 0.00019239884882342985,
"loss": 4.8641,
"step": 450
},
{
"epoch": 0.12698412698412698,
"grad_norm": 1.2670501470565796,
"learning_rate": 0.00019155239546301,
"loss": 4.8605,
"step": 500
},
{
"epoch": 0.13968253968253969,
"grad_norm": 1.3732168674468994,
"learning_rate": 0.00019070594210259017,
"loss": 4.8206,
"step": 550
},
{
"epoch": 0.1523809523809524,
"grad_norm": 1.7774670124053955,
"learning_rate": 0.0001898594887421703,
"loss": 4.8106,
"step": 600
},
{
"epoch": 0.16507936507936508,
"grad_norm": 1.8740317821502686,
"learning_rate": 0.00018901303538175048,
"loss": 4.8007,
"step": 650
},
{
"epoch": 0.17777777777777778,
"grad_norm": 1.6368571519851685,
"learning_rate": 0.00018816658202133063,
"loss": 4.7815,
"step": 700
},
{
"epoch": 0.19047619047619047,
"grad_norm": 1.5557011365890503,
"learning_rate": 0.0001873201286609108,
"loss": 4.7716,
"step": 750
},
{
"epoch": 0.20317460317460317,
"grad_norm": 1.6413705348968506,
"learning_rate": 0.00018647367530049097,
"loss": 4.767,
"step": 800
},
{
"epoch": 0.21587301587301588,
"grad_norm": 1.6060304641723633,
"learning_rate": 0.00018562722194007112,
"loss": 4.7457,
"step": 850
},
{
"epoch": 0.22857142857142856,
"grad_norm": 1.832118034362793,
"learning_rate": 0.0001847807685796513,
"loss": 4.739,
"step": 900
},
{
"epoch": 0.24126984126984127,
"grad_norm": 1.7070050239562988,
"learning_rate": 0.00018393431521923143,
"loss": 4.7319,
"step": 950
},
{
"epoch": 0.25396825396825395,
"grad_norm": 1.768761157989502,
"learning_rate": 0.0001830878618588116,
"loss": 4.7296,
"step": 1000
},
{
"epoch": 0.26666666666666666,
"grad_norm": 1.7824410200119019,
"learning_rate": 0.00018224140849839175,
"loss": 4.7069,
"step": 1050
},
{
"epoch": 0.27936507936507937,
"grad_norm": 1.8590071201324463,
"learning_rate": 0.00018139495513797192,
"loss": 4.6814,
"step": 1100
},
{
"epoch": 0.2920634920634921,
"grad_norm": 1.8381578922271729,
"learning_rate": 0.00018054850177755206,
"loss": 4.6146,
"step": 1150
},
{
"epoch": 0.3047619047619048,
"grad_norm": 1.8397525548934937,
"learning_rate": 0.00017970204841713224,
"loss": 4.5086,
"step": 1200
},
{
"epoch": 0.31746031746031744,
"grad_norm": 1.8704237937927246,
"learning_rate": 0.00017885559505671238,
"loss": 4.376,
"step": 1250
},
{
"epoch": 0.33015873015873015,
"grad_norm": 2.0715010166168213,
"learning_rate": 0.00017800914169629255,
"loss": 4.2503,
"step": 1300
},
{
"epoch": 0.34285714285714286,
"grad_norm": 3.124469041824341,
"learning_rate": 0.0001771626883358727,
"loss": 4.157,
"step": 1350
},
{
"epoch": 0.35555555555555557,
"grad_norm": 2.0155253410339355,
"learning_rate": 0.00017631623497545287,
"loss": 4.1085,
"step": 1400
},
{
"epoch": 0.3682539682539683,
"grad_norm": 2.309039354324341,
"learning_rate": 0.00017546978161503301,
"loss": 4.043,
"step": 1450
},
{
"epoch": 0.38095238095238093,
"grad_norm": 2.6701300144195557,
"learning_rate": 0.0001746233282546132,
"loss": 4.016,
"step": 1500
},
{
"epoch": 0.39365079365079364,
      "grad_norm": null,
"learning_rate": 0.00017377687489419333,
"loss": 3.9352,
"step": 1550
},
{
"epoch": 0.40634920634920635,
"grad_norm": 2.3252177238464355,
"learning_rate": 0.00017294735060098188,
"loss": 3.9148,
"step": 1600
},
{
"epoch": 0.41904761904761906,
"grad_norm": 2.336137533187866,
"learning_rate": 0.00017210089724056205,
"loss": 3.9097,
"step": 1650
},
{
"epoch": 0.43174603174603177,
"grad_norm": 3.3141894340515137,
"learning_rate": 0.0001712544438801422,
"loss": 3.8494,
"step": 1700
},
{
"epoch": 0.4444444444444444,
"grad_norm": 2.5842506885528564,
"learning_rate": 0.0001704079905197224,
"loss": 3.8192,
"step": 1750
},
{
"epoch": 0.45714285714285713,
"grad_norm": 2.667503833770752,
"learning_rate": 0.00016956153715930254,
"loss": 3.835,
"step": 1800
},
{
"epoch": 0.46984126984126984,
"grad_norm": 2.7759885787963867,
"learning_rate": 0.0001687150837988827,
"loss": 3.7916,
"step": 1850
},
{
"epoch": 0.48253968253968255,
"grad_norm": 2.939075231552124,
"learning_rate": 0.00016786863043846285,
"loss": 3.7805,
"step": 1900
},
{
"epoch": 0.49523809523809526,
"grad_norm": 3.0116183757781982,
"learning_rate": 0.00016702217707804303,
"loss": 3.7436,
"step": 1950
},
{
"epoch": 0.5079365079365079,
"grad_norm": 2.9771008491516113,
"learning_rate": 0.00016617572371762317,
"loss": 3.7206,
"step": 2000
},
{
"epoch": 0.5206349206349207,
"grad_norm": 3.095003843307495,
"learning_rate": 0.00016532927035720334,
"loss": 3.7194,
"step": 2050
},
{
"epoch": 0.5333333333333333,
"grad_norm": 3.0788674354553223,
"learning_rate": 0.0001644828169967835,
"loss": 3.6632,
"step": 2100
},
{
"epoch": 0.546031746031746,
"grad_norm": 3.467590093612671,
"learning_rate": 0.00016363636363636366,
"loss": 3.6919,
"step": 2150
},
{
"epoch": 0.5587301587301587,
"grad_norm": 3.4599721431732178,
"learning_rate": 0.0001627899102759438,
"loss": 3.6657,
"step": 2200
},
{
"epoch": 0.5714285714285714,
"grad_norm": 3.194898843765259,
"learning_rate": 0.00016194345691552398,
"loss": 3.6454,
"step": 2250
},
{
"epoch": 0.5841269841269842,
"grad_norm": 3.5158746242523193,
"learning_rate": 0.00016109700355510412,
"loss": 3.6003,
"step": 2300
},
{
"epoch": 0.5968253968253968,
"grad_norm": 3.2812819480895996,
"learning_rate": 0.0001602505501946843,
"loss": 3.5907,
"step": 2350
},
{
"epoch": 0.6095238095238096,
"grad_norm": 3.5167620182037354,
"learning_rate": 0.00015940409683426444,
"loss": 3.5821,
"step": 2400
},
{
"epoch": 0.6222222222222222,
"grad_norm": 3.5780296325683594,
"learning_rate": 0.0001585576434738446,
"loss": 3.5245,
"step": 2450
},
{
"epoch": 0.6349206349206349,
"grad_norm": 4.31138277053833,
"learning_rate": 0.00015771119011342478,
"loss": 3.5197,
"step": 2500
},
{
"epoch": 0.6476190476190476,
"grad_norm": 3.752105236053467,
"learning_rate": 0.00015686473675300493,
"loss": 3.5104,
"step": 2550
},
{
"epoch": 0.6603174603174603,
"grad_norm": 3.916947603225708,
"learning_rate": 0.0001560182833925851,
"loss": 3.5123,
"step": 2600
},
{
"epoch": 0.6730158730158731,
"grad_norm": 3.912555694580078,
"learning_rate": 0.00015517183003216524,
"loss": 3.4689,
"step": 2650
},
{
"epoch": 0.6857142857142857,
"grad_norm": 4.15084981918335,
"learning_rate": 0.00015432537667174541,
"loss": 3.4572,
"step": 2700
},
{
"epoch": 0.6984126984126984,
"grad_norm": 4.125711441040039,
"learning_rate": 0.00015347892331132556,
"loss": 3.4323,
"step": 2750
},
{
"epoch": 0.7111111111111111,
"grad_norm": 3.8961021900177,
"learning_rate": 0.00015263246995090573,
"loss": 3.4215,
"step": 2800
},
{
"epoch": 0.7238095238095238,
"grad_norm": 4.138737678527832,
"learning_rate": 0.00015178601659048588,
"loss": 3.4113,
"step": 2850
},
{
"epoch": 0.7365079365079366,
"grad_norm": 3.9136457443237305,
"learning_rate": 0.00015093956323006605,
"loss": 3.4036,
"step": 2900
},
{
"epoch": 0.7492063492063492,
"grad_norm": 4.833474636077881,
"learning_rate": 0.0001500931098696462,
"loss": 3.3733,
"step": 2950
},
{
"epoch": 0.7619047619047619,
"grad_norm": 3.974536657333374,
"learning_rate": 0.00014924665650922636,
"loss": 3.3637,
"step": 3000
},
{
"epoch": 0.7746031746031746,
"grad_norm": 3.9956250190734863,
"learning_rate": 0.0001484002031488065,
"loss": 3.3421,
"step": 3050
},
{
"epoch": 0.7873015873015873,
"grad_norm": 4.232203483581543,
"learning_rate": 0.00014755374978838668,
"loss": 3.3564,
"step": 3100
},
{
"epoch": 0.8,
"grad_norm": 4.4057207107543945,
"learning_rate": 0.00014670729642796683,
"loss": 3.3156,
"step": 3150
},
{
"epoch": 0.8126984126984127,
"grad_norm": 4.205805778503418,
"learning_rate": 0.000145860843067547,
"loss": 3.3195,
"step": 3200
},
{
"epoch": 0.8253968253968254,
"grad_norm": 4.056403160095215,
"learning_rate": 0.00014501438970712714,
"loss": 3.3083,
"step": 3250
},
{
"epoch": 0.8380952380952381,
"grad_norm": 4.160456657409668,
"learning_rate": 0.0001441679363467073,
"loss": 3.2684,
"step": 3300
},
{
"epoch": 0.8507936507936508,
"grad_norm": 4.401111602783203,
"learning_rate": 0.00014332148298628746,
"loss": 3.2735,
"step": 3350
},
{
"epoch": 0.8634920634920635,
"grad_norm": 4.488119602203369,
"learning_rate": 0.00014247502962586763,
"loss": 3.2653,
"step": 3400
},
{
"epoch": 0.8761904761904762,
"grad_norm": 4.22524881362915,
"learning_rate": 0.00014162857626544777,
"loss": 3.2396,
"step": 3450
},
{
"epoch": 0.8888888888888888,
"grad_norm": 4.20457124710083,
"learning_rate": 0.00014078212290502795,
"loss": 3.2357,
"step": 3500
},
{
"epoch": 0.9015873015873016,
"grad_norm": 4.248793601989746,
"learning_rate": 0.00013993566954460812,
"loss": 3.2263,
"step": 3550
},
{
"epoch": 0.9142857142857143,
"grad_norm": 5.042767524719238,
"learning_rate": 0.00013908921618418826,
"loss": 3.2255,
"step": 3600
},
{
"epoch": 0.926984126984127,
"grad_norm": 4.23060941696167,
"learning_rate": 0.00013824276282376844,
"loss": 3.1735,
"step": 3650
},
{
"epoch": 0.9396825396825397,
"grad_norm": 4.368992805480957,
"learning_rate": 0.00013739630946334858,
"loss": 3.1896,
"step": 3700
},
{
"epoch": 0.9523809523809523,
"grad_norm": 4.162256717681885,
"learning_rate": 0.00013654985610292875,
"loss": 3.1839,
"step": 3750
},
{
"epoch": 0.9650793650793651,
"grad_norm": 4.40270471572876,
"learning_rate": 0.0001357034027425089,
"loss": 3.1565,
"step": 3800
},
{
"epoch": 0.9777777777777777,
"grad_norm": 4.529857635498047,
"learning_rate": 0.00013485694938208907,
"loss": 3.1492,
"step": 3850
},
{
"epoch": 0.9904761904761905,
"grad_norm": 5.210158824920654,
"learning_rate": 0.00013402742508887761,
"loss": 3.1567,
"step": 3900
},
{
"epoch": 1.003047619047619,
"grad_norm": 5.2821478843688965,
"learning_rate": 0.00013318097172845776,
"loss": 3.1272,
"step": 3950
},
{
"epoch": 1.0157460317460318,
"grad_norm": 4.609035015106201,
"learning_rate": 0.00013233451836803793,
"loss": 3.1204,
"step": 4000
},
{
"epoch": 1.0284444444444445,
"grad_norm": 5.00595235824585,
"learning_rate": 0.00013148806500761808,
"loss": 3.0907,
"step": 4050
},
{
"epoch": 1.0411428571428571,
"grad_norm": 4.449355125427246,
"learning_rate": 0.00013064161164719825,
"loss": 3.1063,
"step": 4100
},
{
"epoch": 1.0538412698412698,
"grad_norm": 4.445562839508057,
"learning_rate": 0.0001297951582867784,
"loss": 3.097,
"step": 4150
},
{
"epoch": 1.0665396825396825,
"grad_norm": 5.181248664855957,
"learning_rate": 0.00012894870492635856,
"loss": 3.0693,
"step": 4200
},
{
"epoch": 1.0792380952380953,
"grad_norm": 4.498983383178711,
"learning_rate": 0.0001281022515659387,
"loss": 3.072,
"step": 4250
},
{
"epoch": 1.091936507936508,
"grad_norm": 4.873691082000732,
"learning_rate": 0.00012725579820551888,
"loss": 3.0545,
"step": 4300
},
{
"epoch": 1.1046349206349206,
"grad_norm": 4.709224700927734,
"learning_rate": 0.00012640934484509903,
"loss": 3.0424,
"step": 4350
},
{
"epoch": 1.1173333333333333,
"grad_norm": 5.21176815032959,
"learning_rate": 0.0001255628914846792,
"loss": 3.0412,
"step": 4400
},
{
"epoch": 1.130031746031746,
"grad_norm": 4.796786785125732,
"learning_rate": 0.00012471643812425934,
"loss": 3.0254,
"step": 4450
},
{
"epoch": 1.1427301587301588,
"grad_norm": 5.506641864776611,
"learning_rate": 0.00012386998476383951,
"loss": 3.0268,
"step": 4500
},
{
"epoch": 1.1554285714285715,
"grad_norm": 4.79102897644043,
"learning_rate": 0.00012302353140341966,
"loss": 2.9979,
"step": 4550
},
{
"epoch": 1.1681269841269841,
"grad_norm": 5.435400009155273,
"learning_rate": 0.00012217707804299983,
"loss": 2.987,
"step": 4600
},
{
"epoch": 1.1808253968253968,
"grad_norm": 4.705477714538574,
"learning_rate": 0.00012133062468257999,
"loss": 2.9915,
"step": 4650
},
{
"epoch": 1.1935238095238094,
"grad_norm": 4.822847843170166,
"learning_rate": 0.00012048417132216015,
"loss": 2.9762,
"step": 4700
},
{
"epoch": 1.2062222222222223,
"grad_norm": 4.770782947540283,
"learning_rate": 0.0001196377179617403,
"loss": 2.9744,
"step": 4750
},
{
"epoch": 1.218920634920635,
"grad_norm": 5.113085746765137,
"learning_rate": 0.00011879126460132046,
"loss": 2.9761,
"step": 4800
},
{
"epoch": 1.2316190476190476,
"grad_norm": 6.109035491943359,
"learning_rate": 0.00011796174030810902,
"loss": 2.934,
"step": 4850
},
{
"epoch": 1.2443174603174603,
"grad_norm": 5.884860038757324,
"learning_rate": 0.00011711528694768918,
"loss": 2.9369,
"step": 4900
},
{
"epoch": 1.257015873015873,
"grad_norm": 7.224523544311523,
"learning_rate": 0.00011626883358726934,
"loss": 2.9209,
"step": 4950
},
{
"epoch": 1.2697142857142858,
"grad_norm": 5.234792232513428,
"learning_rate": 0.0001154223802268495,
"loss": 2.9331,
"step": 5000
},
{
"epoch": 1.2824126984126984,
"grad_norm": 4.842894554138184,
"learning_rate": 0.00011457592686642966,
"loss": 2.8986,
"step": 5050
},
{
"epoch": 1.295111111111111,
"grad_norm": 4.660989284515381,
"learning_rate": 0.00011372947350600981,
"loss": 2.9141,
"step": 5100
},
{
"epoch": 1.3078095238095238,
"grad_norm": 5.343238830566406,
"learning_rate": 0.00011288302014558999,
"loss": 2.8895,
"step": 5150
},
{
"epoch": 1.3205079365079366,
"grad_norm": 5.187355041503906,
"learning_rate": 0.00011203656678517014,
"loss": 2.892,
"step": 5200
},
{
"epoch": 1.3332063492063493,
"grad_norm": 4.856098175048828,
"learning_rate": 0.0001111901134247503,
"loss": 2.8736,
"step": 5250
},
{
"epoch": 1.345904761904762,
"grad_norm": 4.733485698699951,
"learning_rate": 0.00011034366006433046,
"loss": 2.8567,
"step": 5300
},
{
"epoch": 1.3586031746031746,
"grad_norm": 4.625833034515381,
"learning_rate": 0.00010949720670391062,
"loss": 2.8737,
"step": 5350
},
{
"epoch": 1.3713015873015872,
"grad_norm": 4.856983184814453,
"learning_rate": 0.00010865075334349078,
"loss": 2.8417,
"step": 5400
},
{
"epoch": 1.384,
"grad_norm": 5.2909932136535645,
"learning_rate": 0.00010780429998307094,
"loss": 2.8346,
"step": 5450
},
{
"epoch": 1.3966984126984128,
"grad_norm": 4.909002780914307,
"learning_rate": 0.0001069578466226511,
"loss": 2.8307,
"step": 5500
},
{
"epoch": 1.4093968253968254,
"grad_norm": 4.69639778137207,
"learning_rate": 0.00010611139326223125,
"loss": 2.8138,
"step": 5550
},
{
"epoch": 1.422095238095238,
"grad_norm": 4.822878837585449,
"learning_rate": 0.00010526493990181141,
"loss": 2.8071,
"step": 5600
},
{
"epoch": 1.4347936507936507,
"grad_norm": 5.602210998535156,
"learning_rate": 0.00010441848654139157,
"loss": 2.8128,
"step": 5650
},
{
"epoch": 1.4474920634920636,
"grad_norm": 4.855912208557129,
"learning_rate": 0.00010357203318097173,
"loss": 2.8066,
"step": 5700
},
{
"epoch": 1.4601904761904763,
"grad_norm": 5.553136348724365,
"learning_rate": 0.00010272557982055189,
"loss": 2.7876,
"step": 5750
},
{
"epoch": 1.472888888888889,
"grad_norm": 4.901633262634277,
"learning_rate": 0.00010187912646013204,
"loss": 2.7787,
"step": 5800
},
{
"epoch": 1.4855873015873016,
"grad_norm": 6.1740217208862305,
"learning_rate": 0.0001010326730997122,
"loss": 2.7704,
"step": 5850
},
{
"epoch": 1.4982857142857142,
"grad_norm": 5.393040180206299,
"learning_rate": 0.00010018621973929236,
"loss": 2.7753,
"step": 5900
},
{
"epoch": 1.5109841269841269,
"grad_norm": 6.0959930419921875,
"learning_rate": 9.933976637887253e-05,
"loss": 2.7538,
"step": 5950
},
{
"epoch": 1.5236825396825395,
"grad_norm": 4.659241199493408,
"learning_rate": 9.849331301845269e-05,
"loss": 2.7517,
"step": 6000
},
{
"epoch": 1.5363809523809524,
"grad_norm": 5.5795087814331055,
"learning_rate": 9.766378872524125e-05,
"loss": 2.7409,
"step": 6050
},
{
"epoch": 1.549079365079365,
"grad_norm": 4.83104944229126,
"learning_rate": 9.681733536482141e-05,
"loss": 2.7358,
"step": 6100
},
{
"epoch": 1.561777777777778,
"grad_norm": 5.035250663757324,
"learning_rate": 9.597088200440157e-05,
"loss": 2.7236,
"step": 6150
},
{
"epoch": 1.5744761904761906,
"grad_norm": 5.167687892913818,
"learning_rate": 9.512442864398172e-05,
"loss": 2.7232,
"step": 6200
},
{
"epoch": 1.5871746031746032,
"grad_norm": 5.0377326011657715,
"learning_rate": 9.427797528356188e-05,
"loss": 2.7368,
"step": 6250
},
{
"epoch": 1.599873015873016,
"grad_norm": 4.893152713775635,
"learning_rate": 9.343152192314204e-05,
"loss": 2.6973,
"step": 6300
},
{
"epoch": 1.6125714285714285,
"grad_norm": 5.246462345123291,
"learning_rate": 9.25850685627222e-05,
"loss": 2.6858,
"step": 6350
},
{
"epoch": 1.6252698412698412,
"grad_norm": 5.26235294342041,
"learning_rate": 9.173861520230236e-05,
"loss": 2.6816,
"step": 6400
},
{
"epoch": 1.6379682539682539,
"grad_norm": 4.8995513916015625,
"learning_rate": 9.089216184188252e-05,
"loss": 2.6865,
"step": 6450
},
{
"epoch": 1.6506666666666665,
"grad_norm": 5.598567962646484,
"learning_rate": 9.004570848146267e-05,
"loss": 2.6675,
"step": 6500
},
{
"epoch": 1.6633650793650794,
"grad_norm": 5.423081874847412,
"learning_rate": 8.919925512104283e-05,
"loss": 2.66,
"step": 6550
},
{
"epoch": 1.676063492063492,
"grad_norm": 4.968945026397705,
"learning_rate": 8.835280176062299e-05,
"loss": 2.6629,
"step": 6600
},
{
"epoch": 1.688761904761905,
"grad_norm": 6.054278373718262,
"learning_rate": 8.750634840020315e-05,
"loss": 2.6784,
"step": 6650
},
{
"epoch": 1.7014603174603176,
"grad_norm": 5.279598712921143,
"learning_rate": 8.665989503978331e-05,
"loss": 2.6327,
"step": 6700
},
{
"epoch": 1.7141587301587302,
"grad_norm": 5.150700092315674,
"learning_rate": 8.581344167936347e-05,
"loss": 2.6394,
"step": 6750
},
{
"epoch": 1.7268571428571429,
"grad_norm": 5.459251403808594,
"learning_rate": 8.496698831894362e-05,
"loss": 2.644,
"step": 6800
},
{
"epoch": 1.7395555555555555,
"grad_norm": 5.293938159942627,
"learning_rate": 8.413746402573218e-05,
"loss": 2.646,
"step": 6850
},
{
"epoch": 1.7522539682539682,
"grad_norm": 5.72529411315918,
"learning_rate": 8.329101066531234e-05,
"loss": 2.624,
"step": 6900
},
{
"epoch": 1.7649523809523808,
"grad_norm": 5.739988327026367,
"learning_rate": 8.24445573048925e-05,
"loss": 2.6175,
"step": 6950
},
{
"epoch": 1.7776507936507937,
"grad_norm": 5.638957500457764,
"learning_rate": 8.159810394447266e-05,
"loss": 2.6236,
"step": 7000
},
{
"epoch": 1.7903492063492064,
"grad_norm": 5.7885026931762695,
"learning_rate": 8.075165058405282e-05,
"loss": 2.5973,
"step": 7050
},
{
"epoch": 1.803047619047619,
"grad_norm": 5.244924545288086,
"learning_rate": 7.990519722363298e-05,
"loss": 2.6031,
"step": 7100
},
{
"epoch": 1.8157460317460319,
"grad_norm": 9.06462287902832,
"learning_rate": 7.905874386321313e-05,
"loss": 2.5768,
"step": 7150
},
{
"epoch": 1.8284444444444445,
"grad_norm": 5.335842132568359,
"learning_rate": 7.821229050279329e-05,
"loss": 2.581,
"step": 7200
},
{
"epoch": 1.8411428571428572,
"grad_norm": 5.182567596435547,
"learning_rate": 7.736583714237345e-05,
"loss": 2.5754,
"step": 7250
},
{
"epoch": 1.8538412698412698,
"grad_norm": 5.487778186798096,
"learning_rate": 7.651938378195361e-05,
"loss": 2.5688,
"step": 7300
},
{
"epoch": 1.8665396825396825,
"grad_norm": 5.46382474899292,
"learning_rate": 7.567293042153377e-05,
"loss": 2.5632,
"step": 7350
},
{
"epoch": 1.8792380952380952,
"grad_norm": 5.17083740234375,
"learning_rate": 7.482647706111392e-05,
"loss": 2.55,
"step": 7400
},
{
"epoch": 1.8919365079365078,
"grad_norm": 5.455732345581055,
"learning_rate": 7.398002370069408e-05,
"loss": 2.5509,
"step": 7450
},
{
"epoch": 1.9046349206349207,
"grad_norm": 5.0072503089904785,
"learning_rate": 7.313357034027426e-05,
"loss": 2.5436,
"step": 7500
},
{
"epoch": 1.9173333333333333,
"grad_norm": 5.0145087242126465,
"learning_rate": 7.228711697985441e-05,
"loss": 2.5416,
"step": 7550
},
{
"epoch": 1.930031746031746,
"grad_norm": 5.2530364990234375,
"learning_rate": 7.144066361943457e-05,
"loss": 2.5234,
"step": 7600
},
{
"epoch": 1.9427301587301589,
"grad_norm": 4.886019229888916,
"learning_rate": 7.059421025901474e-05,
"loss": 2.5447,
"step": 7650
},
{
"epoch": 1.9554285714285715,
"grad_norm": 5.070368766784668,
"learning_rate": 6.97477568985949e-05,
"loss": 2.5308,
"step": 7700
},
{
"epoch": 1.9681269841269842,
"grad_norm": 5.158459186553955,
"learning_rate": 6.890130353817506e-05,
"loss": 2.5265,
"step": 7750
},
{
"epoch": 1.9808253968253968,
"grad_norm": 5.249716281890869,
"learning_rate": 6.805485017775522e-05,
"loss": 2.5079,
"step": 7800
},
{
"epoch": 1.9935238095238095,
"grad_norm": 5.3184590339660645,
"learning_rate": 6.720839681733538e-05,
"loss": 2.5135,
"step": 7850
},
{
"epoch": 2.006095238095238,
"grad_norm": 4.898834705352783,
"learning_rate": 6.636194345691553e-05,
"loss": 2.487,
"step": 7900
},
{
"epoch": 2.0187936507936506,
"grad_norm": 4.718045234680176,
"learning_rate": 6.55154900964957e-05,
"loss": 2.504,
"step": 7950
},
{
"epoch": 2.0314920634920637,
"grad_norm": 4.977246284484863,
"learning_rate": 6.466903673607585e-05,
"loss": 2.4924,
"step": 8000
},
{
"epoch": 2.0441904761904763,
"grad_norm": 5.398455619812012,
"learning_rate": 6.382258337565601e-05,
"loss": 2.4839,
"step": 8050
},
{
"epoch": 2.056888888888889,
"grad_norm": 6.387637138366699,
"learning_rate": 6.297613001523617e-05,
"loss": 2.4833,
"step": 8100
},
{
"epoch": 2.0695873015873016,
"grad_norm": 5.588785648345947,
"learning_rate": 6.212967665481633e-05,
"loss": 2.4569,
"step": 8150
},
{
"epoch": 2.0822857142857143,
"grad_norm": 6.301563262939453,
"learning_rate": 6.128322329439648e-05,
"loss": 2.4748,
"step": 8200
},
{
"epoch": 2.094984126984127,
"grad_norm": 5.7610979080200195,
"learning_rate": 6.043676993397664e-05,
"loss": 2.4597,
"step": 8250
},
{
"epoch": 2.1076825396825396,
"grad_norm": 5.260268211364746,
"learning_rate": 5.95903165735568e-05,
"loss": 2.4581,
"step": 8300
},
{
"epoch": 2.1203809523809523,
"grad_norm": 5.712375640869141,
"learning_rate": 5.874386321313696e-05,
"loss": 2.4522,
"step": 8350
},
{
"epoch": 2.133079365079365,
"grad_norm": 6.139365196228027,
"learning_rate": 5.7897409852717125e-05,
"loss": 2.4588,
"step": 8400
},
{
"epoch": 2.145777777777778,
"grad_norm": 5.294638633728027,
"learning_rate": 5.705095649229728e-05,
"loss": 2.4467,
"step": 8450
},
{
"epoch": 2.1584761904761907,
"grad_norm": 6.786096572875977,
"learning_rate": 5.620450313187744e-05,
"loss": 2.4538,
"step": 8500
},
{
"epoch": 2.1711746031746033,
"grad_norm": 4.95149564743042,
"learning_rate": 5.53580497714576e-05,
"loss": 2.4478,
"step": 8550
},
{
"epoch": 2.183873015873016,
"grad_norm": 4.965189456939697,
"learning_rate": 5.451159641103776e-05,
"loss": 2.4449,
"step": 8600
},
{
"epoch": 2.1965714285714286,
"grad_norm": 5.404216766357422,
"learning_rate": 5.3665143050617916e-05,
"loss": 2.4447,
"step": 8650
},
{
"epoch": 2.2092698412698413,
"grad_norm": 5.965044021606445,
"learning_rate": 5.2818689690198075e-05,
"loss": 2.4337,
"step": 8700
},
{
"epoch": 2.221968253968254,
"grad_norm": 6.933143615722656,
"learning_rate": 5.197223632977823e-05,
"loss": 2.4105,
"step": 8750
},
{
"epoch": 2.2346666666666666,
"grad_norm": 4.795731067657471,
"learning_rate": 5.112578296935839e-05,
"loss": 2.4192,
"step": 8800
},
{
"epoch": 2.2473650793650792,
"grad_norm": 5.221839427947998,
"learning_rate": 5.027932960893855e-05,
"loss": 2.4232,
"step": 8850
},
{
"epoch": 2.260063492063492,
"grad_norm": 5.512608528137207,
"learning_rate": 4.9432876248518715e-05,
"loss": 2.414,
"step": 8900
},
{
"epoch": 2.2727619047619045,
"grad_norm": 5.804533958435059,
"learning_rate": 4.858642288809887e-05,
"loss": 2.4088,
"step": 8950
},
{
"epoch": 2.2854603174603176,
"grad_norm": 5.301205635070801,
"learning_rate": 4.773996952767903e-05,
"loss": 2.4074,
"step": 9000
},
{
"epoch": 2.2981587301587303,
"grad_norm": 5.4529290199279785,
"learning_rate": 4.689351616725919e-05,
"loss": 2.3906,
"step": 9050
},
{
"epoch": 2.310857142857143,
"grad_norm": 5.1715006828308105,
"learning_rate": 4.604706280683935e-05,
"loss": 2.4082,
"step": 9100
},
{
"epoch": 2.3235555555555556,
"grad_norm": 5.739888668060303,
"learning_rate": 4.5200609446419506e-05,
"loss": 2.3982,
"step": 9150
},
{
"epoch": 2.3362539682539682,
"grad_norm": 5.454600811004639,
"learning_rate": 4.4354156085999664e-05,
"loss": 2.3947,
"step": 9200
},
{
"epoch": 2.348952380952381,
"grad_norm": 5.671194553375244,
"learning_rate": 4.350770272557982e-05,
"loss": 2.3984,
"step": 9250
},
{
"epoch": 2.3616507936507936,
"grad_norm": 5.695377349853516,
"learning_rate": 4.266124936515998e-05,
"loss": 2.3732,
"step": 9300
},
{
"epoch": 2.374349206349206,
"grad_norm": 5.454712390899658,
"learning_rate": 4.181479600474014e-05,
"loss": 2.3769,
"step": 9350
},
{
"epoch": 2.387047619047619,
"grad_norm": 5.6516499519348145,
"learning_rate": 4.0968342644320304e-05,
"loss": 2.3788,
"step": 9400
},
{
"epoch": 2.399746031746032,
"grad_norm": 5.617581367492676,
"learning_rate": 4.012188928390046e-05,
"loss": 2.3754,
"step": 9450
},
{
"epoch": 2.4124444444444446,
"grad_norm": 5.995534420013428,
"learning_rate": 3.927543592348062e-05,
"loss": 2.3647,
"step": 9500
},
{
"epoch": 2.4251428571428573,
"grad_norm": 4.861730098724365,
"learning_rate": 3.842898256306078e-05,
"loss": 2.3699,
"step": 9550
},
{
"epoch": 2.43784126984127,
"grad_norm": 5.109068393707275,
"learning_rate": 3.758252920264094e-05,
"loss": 2.3709,
"step": 9600
},
{
"epoch": 2.4505396825396826,
"grad_norm": 5.989678382873535,
"learning_rate": 3.6736075842221096e-05,
"loss": 2.3442,
"step": 9650
},
{
"epoch": 2.4632380952380952,
"grad_norm": 5.233463764190674,
"learning_rate": 3.5889622481801254e-05,
"loss": 2.3567,
"step": 9700
},
{
"epoch": 2.475936507936508,
"grad_norm": 4.930139541625977,
"learning_rate": 3.504316912138141e-05,
"loss": 2.3334,
"step": 9750
},
{
"epoch": 2.4886349206349205,
"grad_norm": 5.235612392425537,
"learning_rate": 3.419671576096157e-05,
"loss": 2.3523,
"step": 9800
},
{
"epoch": 2.501333333333333,
"grad_norm": 7.875730991363525,
"learning_rate": 3.335026240054173e-05,
"loss": 2.3508,
"step": 9850
},
{
"epoch": 2.514031746031746,
"grad_norm": 5.479938507080078,
"learning_rate": 3.2503809040121894e-05,
"loss": 2.3579,
"step": 9900
},
{
"epoch": 2.5267301587301585,
"grad_norm": 4.948204040527344,
"learning_rate": 3.165735567970205e-05,
"loss": 2.3475,
"step": 9950
},
{
"epoch": 2.5394285714285716,
"grad_norm": 5.541229724884033,
"learning_rate": 3.081090231928221e-05,
"loss": 2.3276,
"step": 10000
},
{
"epoch": 2.5521269841269842,
"grad_norm": 5.418817043304443,
"learning_rate": 2.996444895886237e-05,
"loss": 2.3433,
"step": 10050
},
{
"epoch": 2.564825396825397,
"grad_norm": 7.228455543518066,
"learning_rate": 2.9117995598442527e-05,
"loss": 2.3274,
"step": 10100
},
{
"epoch": 2.5775238095238096,
"grad_norm": 5.323376655578613,
"learning_rate": 2.8271542238022686e-05,
"loss": 2.3404,
"step": 10150
},
{
"epoch": 2.590222222222222,
"grad_norm": 5.080998420715332,
"learning_rate": 2.7425088877602844e-05,
"loss": 2.3164,
"step": 10200
},
{
"epoch": 2.602920634920635,
"grad_norm": 5.0400285720825195,
"learning_rate": 2.6578635517183002e-05,
"loss": 2.3301,
"step": 10250
},
{
"epoch": 2.6156190476190475,
"grad_norm": 5.519168853759766,
"learning_rate": 2.5732182156763164e-05,
"loss": 2.3206,
"step": 10300
},
{
"epoch": 2.62831746031746,
"grad_norm": 5.184562683105469,
"learning_rate": 2.4885728796343322e-05,
"loss": 2.3197,
"step": 10350
},
{
"epoch": 2.6410158730158733,
"grad_norm": 5.173785209655762,
"learning_rate": 2.403927543592348e-05,
"loss": 2.3176,
"step": 10400
},
{
"epoch": 2.653714285714286,
"grad_norm": 5.67647647857666,
"learning_rate": 2.319282207550364e-05,
"loss": 2.3154,
"step": 10450
},
{
"epoch": 2.6664126984126986,
"grad_norm": 6.398087978363037,
"learning_rate": 2.2346368715083797e-05,
"loss": 2.3164,
"step": 10500
},
{
"epoch": 2.679111111111111,
"grad_norm": 5.975333213806152,
"learning_rate": 2.149991535466396e-05,
"loss": 2.314,
"step": 10550
},
{
"epoch": 2.691809523809524,
"grad_norm": 5.434169292449951,
"learning_rate": 2.065346199424412e-05,
"loss": 2.3034,
"step": 10600
},
{
"epoch": 2.7045079365079365,
"grad_norm": 5.366811275482178,
"learning_rate": 1.980700863382428e-05,
"loss": 2.3161,
"step": 10650
},
{
"epoch": 2.717206349206349,
"grad_norm": 6.124394416809082,
"learning_rate": 1.8960555273404437e-05,
"loss": 2.3031,
"step": 10700
},
{
"epoch": 2.729904761904762,
"grad_norm": 6.769460201263428,
"learning_rate": 1.8114101912984595e-05,
"loss": 2.2893,
"step": 10750
},
{
"epoch": 2.7426031746031745,
"grad_norm": 4.68062162399292,
"learning_rate": 1.7267648552564754e-05,
"loss": 2.3194,
"step": 10800
},
{
"epoch": 2.755301587301587,
"grad_norm": 5.621355056762695,
"learning_rate": 1.6421195192144915e-05,
"loss": 2.3095,
"step": 10850
},
{
"epoch": 2.768,
"grad_norm": 5.693627834320068,
"learning_rate": 1.5574741831725074e-05,
"loss": 2.3029,
"step": 10900
},
{
"epoch": 2.7806984126984124,
"grad_norm": 5.223197937011719,
"learning_rate": 1.4728288471305232e-05,
"loss": 2.2965,
"step": 10950
},
{
"epoch": 2.7933968253968255,
"grad_norm": 5.495180606842041,
"learning_rate": 1.388183511088539e-05,
"loss": 2.3037,
"step": 11000
},
{
"epoch": 2.806095238095238,
"grad_norm": 5.025885105133057,
"learning_rate": 1.303538175046555e-05,
"loss": 2.2896,
"step": 11050
},
{
"epoch": 2.818793650793651,
"grad_norm": 5.007611274719238,
"learning_rate": 1.2188928390045709e-05,
"loss": 2.2878,
"step": 11100
},
{
"epoch": 2.8314920634920635,
"grad_norm": 4.786118984222412,
"learning_rate": 1.1342475029625869e-05,
"loss": 2.2823,
"step": 11150
},
{
"epoch": 2.844190476190476,
"grad_norm": 5.517834663391113,
"learning_rate": 1.0496021669206027e-05,
"loss": 2.2796,
"step": 11200
},
{
"epoch": 2.856888888888889,
"grad_norm": 7.0708818435668945,
"learning_rate": 9.649568308786185e-06,
"loss": 2.2883,
"step": 11250
},
{
"epoch": 2.8695873015873015,
"grad_norm": 5.164283275604248,
"learning_rate": 8.803114948366345e-06,
"loss": 2.2921,
"step": 11300
},
{
"epoch": 2.8822857142857146,
"grad_norm": 5.082614421844482,
"learning_rate": 7.956661587946503e-06,
"loss": 2.2956,
"step": 11350
},
{
"epoch": 2.894984126984127,
"grad_norm": 5.358335018157959,
"learning_rate": 7.110208227526663e-06,
"loss": 2.29,
"step": 11400
},
{
"epoch": 2.90768253968254,
"grad_norm": 4.937663555145264,
"learning_rate": 6.2637548671068235e-06,
"loss": 2.2912,
"step": 11450
},
{
"epoch": 2.9203809523809525,
"grad_norm": 4.954619407653809,
"learning_rate": 5.417301506686982e-06,
"loss": 2.2946,
"step": 11500
},
{
"epoch": 2.933079365079365,
"grad_norm": 5.406091690063477,
"learning_rate": 4.570848146267141e-06,
"loss": 2.2903,
"step": 11550
},
{
"epoch": 2.945777777777778,
"grad_norm": 5.957233428955078,
"learning_rate": 3.7243947858473e-06,
"loss": 2.2808,
"step": 11600
},
{
"epoch": 2.9584761904761905,
"grad_norm": 5.3814215660095215,
"learning_rate": 2.8779414254274592e-06,
"loss": 2.2918,
"step": 11650
},
{
"epoch": 2.971174603174603,
"grad_norm": 7.456835746765137,
"learning_rate": 2.0314880650076184e-06,
"loss": 2.2871,
"step": 11700
},
{
"epoch": 2.983873015873016,
"grad_norm": 5.577419757843018,
"learning_rate": 1.1850347045877773e-06,
"loss": 2.2791,
"step": 11750
},
{
"epoch": 2.9965714285714284,
"grad_norm": 5.656188488006592,
"learning_rate": 3.3858134416793636e-07,
"loss": 2.2811,
"step": 11800
},
{
"epoch": 3.0,
"step": 11814,
"total_flos": 5952688007980032.0,
"train_loss": 3.0341684326254534,
"train_runtime": 5229.0251,
"train_samples_per_second": 72.289,
"train_steps_per_second": 2.259
}
],
"logging_steps": 50,
"max_steps": 11814,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5952688007980032.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}