{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 177, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01694915254237288, "grad_norm": 11.399982724065612, "learning_rate": 0.0, "loss": 0.5723, "step": 1 }, { "epoch": 0.03389830508474576, "grad_norm": 15.996953522086482, "learning_rate": 6.25e-07, "loss": 0.6919, "step": 2 }, { "epoch": 0.05084745762711865, "grad_norm": 12.017237167498154, "learning_rate": 1.25e-06, "loss": 0.5464, "step": 3 }, { "epoch": 0.06779661016949153, "grad_norm": 9.524531515096024, "learning_rate": 1.8750000000000003e-06, "loss": 0.4571, "step": 4 }, { "epoch": 0.0847457627118644, "grad_norm": 10.11840992474023, "learning_rate": 2.5e-06, "loss": 0.512, "step": 5 }, { "epoch": 0.1016949152542373, "grad_norm": 8.291307792118173, "learning_rate": 3.125e-06, "loss": 0.4744, "step": 6 }, { "epoch": 0.11864406779661017, "grad_norm": 4.947811089030161, "learning_rate": 3.7500000000000005e-06, "loss": 0.4064, "step": 7 }, { "epoch": 0.13559322033898305, "grad_norm": 3.957348877914598, "learning_rate": 4.3750000000000005e-06, "loss": 0.4147, "step": 8 }, { "epoch": 0.15254237288135594, "grad_norm": 1.878974520193955, "learning_rate": 5e-06, "loss": 0.3625, "step": 9 }, { "epoch": 0.1694915254237288, "grad_norm": 1.685893632116404, "learning_rate": 4.999568059583401e-06, "loss": 0.3652, "step": 10 }, { "epoch": 0.1864406779661017, "grad_norm": 0.8038396125458356, "learning_rate": 4.998272387591625e-06, "loss": 0.2205, "step": 11 }, { "epoch": 0.2033898305084746, "grad_norm": 1.1469245810216282, "learning_rate": 4.99611343174715e-06, "loss": 0.3542, "step": 12 }, { "epoch": 0.22033898305084745, "grad_norm": 1.1786630363061648, "learning_rate": 4.993091938082206e-06, "loss": 0.3458, "step": 13 }, { "epoch": 0.23728813559322035, "grad_norm": 1.1958089208385483, "learning_rate": 4.989208950680979e-06, "loss": 0.3565, "step": 14 }, { "epoch": 0.2542372881355932, "grad_norm": 0.9163324740544867, "learning_rate": 4.984465811318826e-06, "loss": 0.3015, "step": 15 }, { "epoch": 0.2711864406779661, "grad_norm": 0.7719232841825565, "learning_rate": 4.97886415899862e-06, "loss": 0.3175, "step": 16 }, { "epoch": 0.288135593220339, "grad_norm": 0.7319081785361947, "learning_rate": 4.972405929384391e-06, "loss": 0.3195, "step": 17 }, { "epoch": 0.3050847457627119, "grad_norm": 0.7002551472674452, "learning_rate": 4.965093354132451e-06, "loss": 0.3074, "step": 18 }, { "epoch": 0.3220338983050847, "grad_norm": 0.6366390198837392, "learning_rate": 4.9569289601202405e-06, "loss": 0.287, "step": 19 }, { "epoch": 0.3389830508474576, "grad_norm": 0.48511268745049707, "learning_rate": 4.9479155685731595e-06, "loss": 0.2634, "step": 20 }, { "epoch": 0.3559322033898305, "grad_norm": 0.5725593300080952, "learning_rate": 4.938056294089689e-06, "loss": 0.2718, "step": 21 }, { "epoch": 0.3728813559322034, "grad_norm": 0.5881100232575105, "learning_rate": 4.927354543565131e-06, "loss": 0.2813, "step": 22 }, { "epoch": 0.3898305084745763, "grad_norm": 0.6954472249924972, "learning_rate": 4.915814015014349e-06, "loss": 0.2829, "step": 23 }, { "epoch": 0.4067796610169492, "grad_norm": 1.7773763839304515, "learning_rate": 4.90343869629391e-06, "loss": 0.2961, "step": 24 }, { "epoch": 0.423728813559322, "grad_norm": 0.7136095313835628, "learning_rate": 4.890232863724075e-06, "loss": 0.314, "step": 25 }, { "epoch": 0.4406779661016949, "grad_norm": 
0.5965674482281343, "learning_rate": 4.8762010806111e-06, "loss": 0.305, "step": 26 }, { "epoch": 0.4576271186440678, "grad_norm": 0.5426262579836809, "learning_rate": 4.861348195670381e-06, "loss": 0.2904, "step": 27 }, { "epoch": 0.4745762711864407, "grad_norm": 0.6232000750718879, "learning_rate": 4.845679341350963e-06, "loss": 0.3194, "step": 28 }, { "epoch": 0.4915254237288136, "grad_norm": 0.5521136750290783, "learning_rate": 4.8291999320620185e-06, "loss": 0.3016, "step": 29 }, { "epoch": 0.5084745762711864, "grad_norm": 0.5746062885940153, "learning_rate": 4.811915662301877e-06, "loss": 0.2648, "step": 30 }, { "epoch": 0.5254237288135594, "grad_norm": 0.607102290977902, "learning_rate": 4.793832504690283e-06, "loss": 0.299, "step": 31 }, { "epoch": 0.5423728813559322, "grad_norm": 0.49962250759355686, "learning_rate": 4.774956707904542e-06, "loss": 0.2515, "step": 32 }, { "epoch": 0.559322033898305, "grad_norm": 0.4701577356569123, "learning_rate": 4.755294794520277e-06, "loss": 0.2491, "step": 33 }, { "epoch": 0.576271186440678, "grad_norm": 0.7858423854608445, "learning_rate": 4.734853558757534e-06, "loss": 0.2827, "step": 34 }, { "epoch": 0.5932203389830508, "grad_norm": 0.49539465146154066, "learning_rate": 4.7136400641330245e-06, "loss": 0.2844, "step": 35 }, { "epoch": 0.6101694915254238, "grad_norm": 0.4775787849669313, "learning_rate": 4.691661641019316e-06, "loss": 0.2587, "step": 36 }, { "epoch": 0.6271186440677966, "grad_norm": 0.6390518183116327, "learning_rate": 4.6689258841117946e-06, "loss": 0.2926, "step": 37 }, { "epoch": 0.6440677966101694, "grad_norm": 0.5314073848549145, "learning_rate": 4.64544064980431e-06, "loss": 0.2485, "step": 38 }, { "epoch": 0.6610169491525424, "grad_norm": 0.4020113800366879, "learning_rate": 4.621214053474374e-06, "loss": 0.2132, "step": 39 }, { "epoch": 0.6779661016949152, "grad_norm": 0.5353215315551882, "learning_rate": 4.596254466678877e-06, "loss": 0.2788, "step": 40 }, { "epoch": 0.6949152542372882, "grad_norm": 0.4769490074416754, "learning_rate": 4.570570514261272e-06, "loss": 0.2703, "step": 41 }, { "epoch": 0.711864406779661, "grad_norm": 0.5249549778546513, "learning_rate": 4.544171071371246e-06, "loss": 0.2701, "step": 42 }, { "epoch": 0.7288135593220338, "grad_norm": 0.5163626189073403, "learning_rate": 4.517065260397887e-06, "loss": 0.2864, "step": 43 }, { "epoch": 0.7457627118644068, "grad_norm": 0.5413634640003411, "learning_rate": 4.489262447817421e-06, "loss": 0.2568, "step": 44 }, { "epoch": 0.7627118644067796, "grad_norm": 0.5149640991984162, "learning_rate": 4.460772240956609e-06, "loss": 0.2525, "step": 45 }, { "epoch": 0.7796610169491526, "grad_norm": 0.445354494671823, "learning_rate": 4.431604484672905e-06, "loss": 0.2172, "step": 46 }, { "epoch": 0.7966101694915254, "grad_norm": 0.5009716150531464, "learning_rate": 4.401769257952551e-06, "loss": 0.2624, "step": 47 }, { "epoch": 0.8135593220338984, "grad_norm": 0.5140310721121815, "learning_rate": 4.3712768704277535e-06, "loss": 0.2781, "step": 48 }, { "epoch": 0.8305084745762712, "grad_norm": 0.5467147233990847, "learning_rate": 4.340137858814168e-06, "loss": 0.244, "step": 49 }, { "epoch": 0.847457627118644, "grad_norm": 0.4862542975816897, "learning_rate": 4.308362983269916e-06, "loss": 0.226, "step": 50 }, { "epoch": 0.864406779661017, "grad_norm": 0.5701344768833954, "learning_rate": 4.275963223677379e-06, "loss": 0.3184, "step": 51 }, { "epoch": 0.8813559322033898, "grad_norm": 0.5680722480199585, "learning_rate": 4.242949775849083e-06, "loss": 0.2723, 
"step": 52 }, { "epoch": 0.8983050847457628, "grad_norm": 0.6081264789001348, "learning_rate": 4.209334047658956e-06, "loss": 0.2686, "step": 53 }, { "epoch": 0.9152542372881356, "grad_norm": 0.46308625878084975, "learning_rate": 4.175127655100306e-06, "loss": 0.2763, "step": 54 }, { "epoch": 0.9322033898305084, "grad_norm": 0.41899355255330367, "learning_rate": 4.140342418271897e-06, "loss": 0.2555, "step": 55 }, { "epoch": 0.9491525423728814, "grad_norm": 0.5677964936366232, "learning_rate": 4.104990357293478e-06, "loss": 0.2808, "step": 56 }, { "epoch": 0.9661016949152542, "grad_norm": 0.5450969329455534, "learning_rate": 4.069083688152206e-06, "loss": 0.2464, "step": 57 }, { "epoch": 0.9830508474576272, "grad_norm": 0.49742989815197775, "learning_rate": 4.032634818481382e-06, "loss": 0.2435, "step": 58 }, { "epoch": 1.0, "grad_norm": 0.47791981816360296, "learning_rate": 3.995656343272969e-06, "loss": 0.2431, "step": 59 }, { "epoch": 1.0169491525423728, "grad_norm": 0.5336552382444577, "learning_rate": 3.958161040525354e-06, "loss": 0.2608, "step": 60 }, { "epoch": 1.0338983050847457, "grad_norm": 0.6255845843539747, "learning_rate": 3.92016186682789e-06, "loss": 0.2718, "step": 61 }, { "epoch": 1.0508474576271187, "grad_norm": 0.5386071690484918, "learning_rate": 3.88167195288371e-06, "loss": 0.2472, "step": 62 }, { "epoch": 1.0677966101694916, "grad_norm": 0.4610208030538615, "learning_rate": 3.842704598972384e-06, "loss": 0.2711, "step": 63 }, { "epoch": 1.0847457627118644, "grad_norm": 0.5005083514691049, "learning_rate": 3.80327327035398e-06, "loss": 0.243, "step": 64 }, { "epoch": 1.1016949152542372, "grad_norm": 0.5256738643041071, "learning_rate": 3.763391592616104e-06, "loss": 0.2304, "step": 65 }, { "epoch": 1.11864406779661, "grad_norm": 0.4461147519625525, "learning_rate": 3.7230733469655554e-06, "loss": 0.257, "step": 66 }, { "epoch": 1.1355932203389831, "grad_norm": 0.5616719506601695, "learning_rate": 3.6823324654661923e-06, "loss": 0.2695, "step": 67 }, { "epoch": 1.152542372881356, "grad_norm": 0.5267858101125528, "learning_rate": 3.6411830262246755e-06, "loss": 0.2626, "step": 68 }, { "epoch": 1.1694915254237288, "grad_norm": 0.44572869985154356, "learning_rate": 3.599639248525749e-06, "loss": 0.2314, "step": 69 }, { "epoch": 1.1864406779661016, "grad_norm": 0.4743820453627211, "learning_rate": 3.5577154879187286e-06, "loss": 0.2524, "step": 70 }, { "epoch": 1.2033898305084745, "grad_norm": 0.4795920562830289, "learning_rate": 3.5154262312569134e-06, "loss": 0.2272, "step": 71 }, { "epoch": 1.2203389830508475, "grad_norm": 0.4454617572461806, "learning_rate": 3.4727860916916143e-06, "loss": 0.2192, "step": 72 }, { "epoch": 1.2372881355932204, "grad_norm": 0.45505003053894033, "learning_rate": 3.429809803622551e-06, "loss": 0.2222, "step": 73 }, { "epoch": 1.2542372881355932, "grad_norm": 0.5092981797760533, "learning_rate": 3.386512217606339e-06, "loss": 0.2221, "step": 74 }, { "epoch": 1.271186440677966, "grad_norm": 0.45652766088591107, "learning_rate": 3.342908295224854e-06, "loss": 0.2115, "step": 75 }, { "epoch": 1.288135593220339, "grad_norm": 0.5773859878970916, "learning_rate": 3.299013103915214e-06, "loss": 0.2735, "step": 76 }, { "epoch": 1.305084745762712, "grad_norm": 0.5403811470495191, "learning_rate": 3.2548418117631952e-06, "loss": 0.2248, "step": 77 }, { "epoch": 1.3220338983050848, "grad_norm": 0.6646747594835315, "learning_rate": 3.2104096822618657e-06, "loss": 0.2554, "step": 78 }, { "epoch": 1.3389830508474576, "grad_norm": 0.44285218067378557, 
"learning_rate": 3.1657320690372464e-06, "loss": 0.1984, "step": 79 }, { "epoch": 1.3559322033898304, "grad_norm": 0.48055286049444484, "learning_rate": 3.120824410542833e-06, "loss": 0.2187, "step": 80 }, { "epoch": 1.3728813559322033, "grad_norm": 0.5320390743973299, "learning_rate": 3.0757022247248e-06, "loss": 0.2232, "step": 81 }, { "epoch": 1.3898305084745763, "grad_norm": 0.4807624431716425, "learning_rate": 3.0303811036597395e-06, "loss": 0.2403, "step": 82 }, { "epoch": 1.4067796610169492, "grad_norm": 0.4244464101592162, "learning_rate": 2.9848767081667823e-06, "loss": 0.2213, "step": 83 }, { "epoch": 1.423728813559322, "grad_norm": 0.49236319422304126, "learning_rate": 2.9392047623959653e-06, "loss": 0.2375, "step": 84 }, { "epoch": 1.4406779661016949, "grad_norm": 0.46784645507056544, "learning_rate": 2.8933810483947156e-06, "loss": 0.2219, "step": 85 }, { "epoch": 1.457627118644068, "grad_norm": 0.48973007159992155, "learning_rate": 2.8474214006543255e-06, "loss": 0.2166, "step": 86 }, { "epoch": 1.4745762711864407, "grad_norm": 0.3510113111902409, "learning_rate": 2.8013417006383078e-06, "loss": 0.1943, "step": 87 }, { "epoch": 1.4915254237288136, "grad_norm": 0.44578327104858334, "learning_rate": 2.755157871294521e-06, "loss": 0.234, "step": 88 }, { "epoch": 1.5084745762711864, "grad_norm": 0.7165659629002311, "learning_rate": 2.708885871552954e-06, "loss": 0.2447, "step": 89 }, { "epoch": 1.5254237288135593, "grad_norm": 0.45065744754362286, "learning_rate": 2.6625416908110825e-06, "loss": 0.2284, "step": 90 }, { "epoch": 1.542372881355932, "grad_norm": 0.41704260341071947, "learning_rate": 2.616141343408696e-06, "loss": 0.2237, "step": 91 }, { "epoch": 1.559322033898305, "grad_norm": 0.5042304558688149, "learning_rate": 2.569700863094104e-06, "loss": 0.2219, "step": 92 }, { "epoch": 1.576271186440678, "grad_norm": 0.5190696344795624, "learning_rate": 2.5232362974836394e-06, "loss": 0.2321, "step": 93 }, { "epoch": 1.5932203389830508, "grad_norm": 0.5279441416014571, "learning_rate": 2.4767637025163614e-06, "loss": 0.2715, "step": 94 }, { "epoch": 1.6101694915254239, "grad_norm": 0.5147780575565639, "learning_rate": 2.4302991369058963e-06, "loss": 0.2445, "step": 95 }, { "epoch": 1.6271186440677967, "grad_norm": 0.4916874716282566, "learning_rate": 2.3838586565913053e-06, "loss": 0.2045, "step": 96 }, { "epoch": 1.6440677966101696, "grad_norm": 0.4829870763326134, "learning_rate": 2.3374583091889188e-06, "loss": 0.2154, "step": 97 }, { "epoch": 1.6610169491525424, "grad_norm": 0.5093333087244544, "learning_rate": 2.2911141284470466e-06, "loss": 0.2453, "step": 98 }, { "epoch": 1.6779661016949152, "grad_norm": 0.6588755855029546, "learning_rate": 2.2448421287054794e-06, "loss": 0.2668, "step": 99 }, { "epoch": 1.694915254237288, "grad_norm": 0.49865992979225093, "learning_rate": 2.1986582993616926e-06, "loss": 0.2283, "step": 100 }, { "epoch": 1.711864406779661, "grad_norm": 0.37695864859441414, "learning_rate": 2.1525785993456753e-06, "loss": 0.2057, "step": 101 }, { "epoch": 1.7288135593220337, "grad_norm": 0.548172589959634, "learning_rate": 2.1066189516052848e-06, "loss": 0.2558, "step": 102 }, { "epoch": 1.7457627118644068, "grad_norm": 0.7296309615305285, "learning_rate": 2.0607952376040355e-06, "loss": 0.2707, "step": 103 }, { "epoch": 1.7627118644067796, "grad_norm": 0.6191305523245286, "learning_rate": 2.0151232918332186e-06, "loss": 0.216, "step": 104 }, { "epoch": 1.7796610169491527, "grad_norm": 0.4275148733512332, "learning_rate": 1.9696188963402613e-06, "loss": 
0.2016, "step": 105 }, { "epoch": 1.7966101694915255, "grad_norm": 0.5433825312095228, "learning_rate": 1.9242977752752006e-06, "loss": 0.2463, "step": 106 }, { "epoch": 1.8135593220338984, "grad_norm": 0.6146764112328339, "learning_rate": 1.879175589457168e-06, "loss": 0.217, "step": 107 }, { "epoch": 1.8305084745762712, "grad_norm": 0.4492249547863865, "learning_rate": 1.8342679309627545e-06, "loss": 0.2112, "step": 108 }, { "epoch": 1.847457627118644, "grad_norm": 0.45112403138122764, "learning_rate": 1.7895903177381351e-06, "loss": 0.2213, "step": 109 }, { "epoch": 1.8644067796610169, "grad_norm": 0.6114123860260355, "learning_rate": 1.7451581882368052e-06, "loss": 0.2067, "step": 110 }, { "epoch": 1.8813559322033897, "grad_norm": 0.47090009128241844, "learning_rate": 1.700986896084787e-06, "loss": 0.23, "step": 111 }, { "epoch": 1.8983050847457628, "grad_norm": 0.4641120289884631, "learning_rate": 1.6570917047751465e-06, "loss": 0.2385, "step": 112 }, { "epoch": 1.9152542372881356, "grad_norm": 0.6393272872227032, "learning_rate": 1.613487782393661e-06, "loss": 0.2196, "step": 113 }, { "epoch": 1.9322033898305084, "grad_norm": 0.41826816207678297, "learning_rate": 1.5701901963774504e-06, "loss": 0.2031, "step": 114 }, { "epoch": 1.9491525423728815, "grad_norm": 0.43623407997620883, "learning_rate": 1.5272139083083865e-06, "loss": 0.2168, "step": 115 }, { "epoch": 1.9661016949152543, "grad_norm": 0.5121666691791417, "learning_rate": 1.4845737687430875e-06, "loss": 0.2476, "step": 116 }, { "epoch": 1.9830508474576272, "grad_norm": 0.4875537170599579, "learning_rate": 1.4422845120812718e-06, "loss": 0.1885, "step": 117 }, { "epoch": 2.0, "grad_norm": 0.6170831997594528, "learning_rate": 1.400360751474253e-06, "loss": 0.2491, "step": 118 }, { "epoch": 2.016949152542373, "grad_norm": 0.4472827280319739, "learning_rate": 1.3588169737753258e-06, "loss": 0.1855, "step": 119 }, { "epoch": 2.0338983050847457, "grad_norm": 0.5413068643010572, "learning_rate": 1.3176675345338085e-06, "loss": 0.2188, "step": 120 }, { "epoch": 2.0508474576271185, "grad_norm": 0.428707546016933, "learning_rate": 1.276926653034444e-06, "loss": 0.2248, "step": 121 }, { "epoch": 2.0677966101694913, "grad_norm": 0.5858749588254089, "learning_rate": 1.2366084073838963e-06, "loss": 0.2598, "step": 122 }, { "epoch": 2.084745762711864, "grad_norm": 0.44846382451641165, "learning_rate": 1.1967267296460208e-06, "loss": 0.1837, "step": 123 }, { "epoch": 2.1016949152542375, "grad_norm": 0.48106733288396136, "learning_rate": 1.157295401027616e-06, "loss": 0.2265, "step": 124 }, { "epoch": 2.1186440677966103, "grad_norm": 0.5527311526391787, "learning_rate": 1.1183280471162916e-06, "loss": 0.2181, "step": 125 }, { "epoch": 2.135593220338983, "grad_norm": 0.4955616787413392, "learning_rate": 1.079838133172111e-06, "loss": 0.2234, "step": 126 }, { "epoch": 2.152542372881356, "grad_norm": 0.43338578262654337, "learning_rate": 1.0418389594746462e-06, "loss": 0.214, "step": 127 }, { "epoch": 2.169491525423729, "grad_norm": 0.3669554740937264, "learning_rate": 1.0043436567270313e-06, "loss": 0.1856, "step": 128 }, { "epoch": 2.1864406779661016, "grad_norm": 0.4089136976273204, "learning_rate": 9.673651815186186e-07, "loss": 0.181, "step": 129 }, { "epoch": 2.2033898305084745, "grad_norm": 0.5094068860196916, "learning_rate": 9.309163118477954e-07, "loss": 0.232, "step": 130 }, { "epoch": 2.2203389830508473, "grad_norm": 0.6111829159014325, "learning_rate": 8.950096427065232e-07, "loss": 0.256, "step": 131 }, { "epoch": 
2.23728813559322, "grad_norm": 0.5499628769610874, "learning_rate": 8.596575817281036e-07, "loss": 0.263, "step": 132 }, { "epoch": 2.2542372881355934, "grad_norm": 0.472315352129982, "learning_rate": 8.248723448996942e-07, "loss": 0.2025, "step": 133 }, { "epoch": 2.2711864406779663, "grad_norm": 0.44621416461066526, "learning_rate": 7.906659523410445e-07, "loss": 0.1964, "step": 134 }, { "epoch": 2.288135593220339, "grad_norm": 0.5438276256045982, "learning_rate": 7.570502241509162e-07, "loss": 0.24, "step": 135 }, { "epoch": 2.305084745762712, "grad_norm": 0.48076560661191803, "learning_rate": 7.240367763226214e-07, "loss": 0.2522, "step": 136 }, { "epoch": 2.3220338983050848, "grad_norm": 0.45221392230794627, "learning_rate": 6.916370167300846e-07, "loss": 0.2031, "step": 137 }, { "epoch": 2.3389830508474576, "grad_norm": 0.6301591765021999, "learning_rate": 6.59862141185832e-07, "loss": 0.2158, "step": 138 }, { "epoch": 2.3559322033898304, "grad_norm": 0.3952157742860489, "learning_rate": 6.28723129572247e-07, "loss": 0.2078, "step": 139 }, { "epoch": 2.3728813559322033, "grad_norm": 0.446514287134317, "learning_rate": 5.982307420474501e-07, "loss": 0.1948, "step": 140 }, { "epoch": 2.389830508474576, "grad_norm": 0.5855904635323722, "learning_rate": 5.683955153270959e-07, "loss": 0.2657, "step": 141 }, { "epoch": 2.406779661016949, "grad_norm": 0.5583109191195978, "learning_rate": 5.39227759043392e-07, "loss": 0.1933, "step": 142 }, { "epoch": 2.423728813559322, "grad_norm": 0.47676799585910457, "learning_rate": 5.107375521825791e-07, "loss": 0.2273, "step": 143 }, { "epoch": 2.440677966101695, "grad_norm": 0.4864750560824795, "learning_rate": 4.829347396021142e-07, "loss": 0.251, "step": 144 }, { "epoch": 2.457627118644068, "grad_norm": 0.6512251016554255, "learning_rate": 4.5582892862875457e-07, "loss": 0.2521, "step": 145 }, { "epoch": 2.4745762711864407, "grad_norm": 0.48233660615594676, "learning_rate": 4.294294857387285e-07, "loss": 0.2275, "step": 146 }, { "epoch": 2.4915254237288136, "grad_norm": 0.6428476684215992, "learning_rate": 4.0374553332112374e-07, "loss": 0.2327, "step": 147 }, { "epoch": 2.5084745762711864, "grad_norm": 0.5076426403103419, "learning_rate": 3.787859465256258e-07, "loss": 0.1708, "step": 148 }, { "epoch": 2.5254237288135593, "grad_norm": 0.5026976859156602, "learning_rate": 3.545593501956901e-07, "loss": 0.213, "step": 149 }, { "epoch": 2.542372881355932, "grad_norm": 0.5662522579882163, "learning_rate": 3.3107411588820527e-07, "loss": 0.1996, "step": 150 }, { "epoch": 2.559322033898305, "grad_norm": 0.5195058601098169, "learning_rate": 3.083383589806846e-07, "loss": 0.2263, "step": 151 }, { "epoch": 2.576271186440678, "grad_norm": 0.5200135379990353, "learning_rate": 2.8635993586697555e-07, "loss": 0.2139, "step": 152 }, { "epoch": 2.593220338983051, "grad_norm": 0.4763652464214379, "learning_rate": 2.6514644124246675e-07, "loss": 0.2229, "step": 153 }, { "epoch": 2.610169491525424, "grad_norm": 0.49804739846772966, "learning_rate": 2.447052054797233e-07, "loss": 0.217, "step": 154 }, { "epoch": 2.6271186440677967, "grad_norm": 0.48757701271761317, "learning_rate": 2.2504329209545846e-07, "loss": 0.2081, "step": 155 }, { "epoch": 2.6440677966101696, "grad_norm": 0.4947797017705308, "learning_rate": 2.0616749530971785e-07, "loss": 0.2351, "step": 156 }, { "epoch": 2.6610169491525424, "grad_norm": 0.5222467856875551, "learning_rate": 1.8808433769812367e-07, "loss": 0.2205, "step": 157 }, { "epoch": 2.6779661016949152, "grad_norm": 0.4225419072915034, 
"learning_rate": 1.7080006793798176e-07, "loss": 0.1926, "step": 158 }, { "epoch": 2.694915254237288, "grad_norm": 0.5691086207780526, "learning_rate": 1.54320658649037e-07, "loss": 0.2317, "step": 159 }, { "epoch": 2.711864406779661, "grad_norm": 0.3840618595955483, "learning_rate": 1.3865180432961977e-07, "loss": 0.1861, "step": 160 }, { "epoch": 2.7288135593220337, "grad_norm": 0.43342821320697045, "learning_rate": 1.237989193889e-07, "loss": 0.1841, "step": 161 }, { "epoch": 2.7457627118644066, "grad_norm": 0.4541304378156735, "learning_rate": 1.0976713627592561e-07, "loss": 0.1838, "step": 162 }, { "epoch": 2.7627118644067794, "grad_norm": 0.47718407958651254, "learning_rate": 9.656130370609057e-08, "loss": 0.2156, "step": 163 }, { "epoch": 2.7796610169491527, "grad_norm": 0.4203326388891047, "learning_rate": 8.418598498565217e-08, "loss": 0.198, "step": 164 }, { "epoch": 2.7966101694915255, "grad_norm": 0.6344933451047076, "learning_rate": 7.264545643486997e-08, "loss": 0.2469, "step": 165 }, { "epoch": 2.8135593220338984, "grad_norm": 0.5726793839966674, "learning_rate": 6.194370591031174e-08, "loss": 0.2504, "step": 166 }, { "epoch": 2.830508474576271, "grad_norm": 0.44232201362023676, "learning_rate": 5.208443142684094e-08, "loss": 0.211, "step": 167 }, { "epoch": 2.847457627118644, "grad_norm": 0.5141286149933719, "learning_rate": 4.307103987976041e-08, "loss": 0.231, "step": 168 }, { "epoch": 2.864406779661017, "grad_norm": 0.4569119724260856, "learning_rate": 3.4906645867549547e-08, "loss": 0.1939, "step": 169 }, { "epoch": 2.8813559322033897, "grad_norm": 0.48342076668279055, "learning_rate": 2.7594070615609426e-08, "loss": 0.2043, "step": 170 }, { "epoch": 2.898305084745763, "grad_norm": 0.34035724769380715, "learning_rate": 2.1135841001380386e-08, "loss": 0.1772, "step": 171 }, { "epoch": 2.915254237288136, "grad_norm": 0.4608098604477918, "learning_rate": 1.55341886811744e-08, "loss": 0.1966, "step": 172 }, { "epoch": 2.9322033898305087, "grad_norm": 0.48086970561951203, "learning_rate": 1.0791049319021086e-08, "loss": 0.2193, "step": 173 }, { "epoch": 2.9491525423728815, "grad_norm": 0.5098124389058276, "learning_rate": 6.908061917794417e-09, "loss": 0.2126, "step": 174 }, { "epoch": 2.9661016949152543, "grad_norm": 0.4671835659759605, "learning_rate": 3.8865682528504975e-09, "loss": 0.2207, "step": 175 }, { "epoch": 2.983050847457627, "grad_norm": 1.3155876369197672, "learning_rate": 1.7276124083753788e-09, "loss": 0.2563, "step": 176 }, { "epoch": 3.0, "grad_norm": 0.5705208597149767, "learning_rate": 4.3194041659866405e-10, "loss": 0.2054, "step": 177 }, { "epoch": 3.0, "step": 177, "total_flos": 118937526042624.0, "train_loss": 0.25391180055626367, "train_runtime": 14992.2104, "train_samples_per_second": 0.188, "train_steps_per_second": 0.012 } ], "logging_steps": 1.0, "max_steps": 177, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 118937526042624.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }