| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 177, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01694915254237288, | |
| "grad_norm": 11.399982724065612, | |
| "learning_rate": 0.0, | |
| "loss": 0.5723, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.03389830508474576, | |
| "grad_norm": 15.996953522086482, | |
| "learning_rate": 6.25e-07, | |
| "loss": 0.6919, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.05084745762711865, | |
| "grad_norm": 12.017237167498154, | |
| "learning_rate": 1.25e-06, | |
| "loss": 0.5464, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.06779661016949153, | |
| "grad_norm": 9.524531515096024, | |
| "learning_rate": 1.8750000000000003e-06, | |
| "loss": 0.4571, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0847457627118644, | |
| "grad_norm": 10.11840992474023, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.512, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.1016949152542373, | |
| "grad_norm": 8.291307792118173, | |
| "learning_rate": 3.125e-06, | |
| "loss": 0.4744, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.11864406779661017, | |
| "grad_norm": 4.947811089030161, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.4064, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.13559322033898305, | |
| "grad_norm": 3.957348877914598, | |
| "learning_rate": 4.3750000000000005e-06, | |
| "loss": 0.4147, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.15254237288135594, | |
| "grad_norm": 1.878974520193955, | |
| "learning_rate": 5e-06, | |
| "loss": 0.3625, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.1694915254237288, | |
| "grad_norm": 1.685893632116404, | |
| "learning_rate": 4.999568059583401e-06, | |
| "loss": 0.3652, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.1864406779661017, | |
| "grad_norm": 0.8038396125458356, | |
| "learning_rate": 4.998272387591625e-06, | |
| "loss": 0.2205, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.2033898305084746, | |
| "grad_norm": 1.1469245810216282, | |
| "learning_rate": 4.99611343174715e-06, | |
| "loss": 0.3542, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.22033898305084745, | |
| "grad_norm": 1.1786630363061648, | |
| "learning_rate": 4.993091938082206e-06, | |
| "loss": 0.3458, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.23728813559322035, | |
| "grad_norm": 1.1958089208385483, | |
| "learning_rate": 4.989208950680979e-06, | |
| "loss": 0.3565, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.2542372881355932, | |
| "grad_norm": 0.9163324740544867, | |
| "learning_rate": 4.984465811318826e-06, | |
| "loss": 0.3015, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.2711864406779661, | |
| "grad_norm": 0.7719232841825565, | |
| "learning_rate": 4.97886415899862e-06, | |
| "loss": 0.3175, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.288135593220339, | |
| "grad_norm": 0.7319081785361947, | |
| "learning_rate": 4.972405929384391e-06, | |
| "loss": 0.3195, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.3050847457627119, | |
| "grad_norm": 0.7002551472674452, | |
| "learning_rate": 4.965093354132451e-06, | |
| "loss": 0.3074, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.3220338983050847, | |
| "grad_norm": 0.6366390198837392, | |
| "learning_rate": 4.9569289601202405e-06, | |
| "loss": 0.287, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.3389830508474576, | |
| "grad_norm": 0.48511268745049707, | |
| "learning_rate": 4.9479155685731595e-06, | |
| "loss": 0.2634, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.3559322033898305, | |
| "grad_norm": 0.5725593300080952, | |
| "learning_rate": 4.938056294089689e-06, | |
| "loss": 0.2718, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.3728813559322034, | |
| "grad_norm": 0.5881100232575105, | |
| "learning_rate": 4.927354543565131e-06, | |
| "loss": 0.2813, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.3898305084745763, | |
| "grad_norm": 0.6954472249924972, | |
| "learning_rate": 4.915814015014349e-06, | |
| "loss": 0.2829, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.4067796610169492, | |
| "grad_norm": 1.7773763839304515, | |
| "learning_rate": 4.90343869629391e-06, | |
| "loss": 0.2961, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.423728813559322, | |
| "grad_norm": 0.7136095313835628, | |
| "learning_rate": 4.890232863724075e-06, | |
| "loss": 0.314, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.4406779661016949, | |
| "grad_norm": 0.5965674482281343, | |
| "learning_rate": 4.8762010806111e-06, | |
| "loss": 0.305, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.4576271186440678, | |
| "grad_norm": 0.5426262579836809, | |
| "learning_rate": 4.861348195670381e-06, | |
| "loss": 0.2904, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.4745762711864407, | |
| "grad_norm": 0.6232000750718879, | |
| "learning_rate": 4.845679341350963e-06, | |
| "loss": 0.3194, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.4915254237288136, | |
| "grad_norm": 0.5521136750290783, | |
| "learning_rate": 4.8291999320620185e-06, | |
| "loss": 0.3016, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.5084745762711864, | |
| "grad_norm": 0.5746062885940153, | |
| "learning_rate": 4.811915662301877e-06, | |
| "loss": 0.2648, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.5254237288135594, | |
| "grad_norm": 0.607102290977902, | |
| "learning_rate": 4.793832504690283e-06, | |
| "loss": 0.299, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.5423728813559322, | |
| "grad_norm": 0.49962250759355686, | |
| "learning_rate": 4.774956707904542e-06, | |
| "loss": 0.2515, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.559322033898305, | |
| "grad_norm": 0.4701577356569123, | |
| "learning_rate": 4.755294794520277e-06, | |
| "loss": 0.2491, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.576271186440678, | |
| "grad_norm": 0.7858423854608445, | |
| "learning_rate": 4.734853558757534e-06, | |
| "loss": 0.2827, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.5932203389830508, | |
| "grad_norm": 0.49539465146154066, | |
| "learning_rate": 4.7136400641330245e-06, | |
| "loss": 0.2844, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.6101694915254238, | |
| "grad_norm": 0.4775787849669313, | |
| "learning_rate": 4.691661641019316e-06, | |
| "loss": 0.2587, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.6271186440677966, | |
| "grad_norm": 0.6390518183116327, | |
| "learning_rate": 4.6689258841117946e-06, | |
| "loss": 0.2926, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.6440677966101694, | |
| "grad_norm": 0.5314073848549145, | |
| "learning_rate": 4.64544064980431e-06, | |
| "loss": 0.2485, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.6610169491525424, | |
| "grad_norm": 0.4020113800366879, | |
| "learning_rate": 4.621214053474374e-06, | |
| "loss": 0.2132, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.6779661016949152, | |
| "grad_norm": 0.5353215315551882, | |
| "learning_rate": 4.596254466678877e-06, | |
| "loss": 0.2788, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.6949152542372882, | |
| "grad_norm": 0.4769490074416754, | |
| "learning_rate": 4.570570514261272e-06, | |
| "loss": 0.2703, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.711864406779661, | |
| "grad_norm": 0.5249549778546513, | |
| "learning_rate": 4.544171071371246e-06, | |
| "loss": 0.2701, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.7288135593220338, | |
| "grad_norm": 0.5163626189073403, | |
| "learning_rate": 4.517065260397887e-06, | |
| "loss": 0.2864, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.7457627118644068, | |
| "grad_norm": 0.5413634640003411, | |
| "learning_rate": 4.489262447817421e-06, | |
| "loss": 0.2568, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.7627118644067796, | |
| "grad_norm": 0.5149640991984162, | |
| "learning_rate": 4.460772240956609e-06, | |
| "loss": 0.2525, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.7796610169491526, | |
| "grad_norm": 0.445354494671823, | |
| "learning_rate": 4.431604484672905e-06, | |
| "loss": 0.2172, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.7966101694915254, | |
| "grad_norm": 0.5009716150531464, | |
| "learning_rate": 4.401769257952551e-06, | |
| "loss": 0.2624, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.8135593220338984, | |
| "grad_norm": 0.5140310721121815, | |
| "learning_rate": 4.3712768704277535e-06, | |
| "loss": 0.2781, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.8305084745762712, | |
| "grad_norm": 0.5467147233990847, | |
| "learning_rate": 4.340137858814168e-06, | |
| "loss": 0.244, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.847457627118644, | |
| "grad_norm": 0.4862542975816897, | |
| "learning_rate": 4.308362983269916e-06, | |
| "loss": 0.226, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.864406779661017, | |
| "grad_norm": 0.5701344768833954, | |
| "learning_rate": 4.275963223677379e-06, | |
| "loss": 0.3184, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.8813559322033898, | |
| "grad_norm": 0.5680722480199585, | |
| "learning_rate": 4.242949775849083e-06, | |
| "loss": 0.2723, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.8983050847457628, | |
| "grad_norm": 0.6081264789001348, | |
| "learning_rate": 4.209334047658956e-06, | |
| "loss": 0.2686, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.9152542372881356, | |
| "grad_norm": 0.46308625878084975, | |
| "learning_rate": 4.175127655100306e-06, | |
| "loss": 0.2763, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.9322033898305084, | |
| "grad_norm": 0.41899355255330367, | |
| "learning_rate": 4.140342418271897e-06, | |
| "loss": 0.2555, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.9491525423728814, | |
| "grad_norm": 0.5677964936366232, | |
| "learning_rate": 4.104990357293478e-06, | |
| "loss": 0.2808, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.9661016949152542, | |
| "grad_norm": 0.5450969329455534, | |
| "learning_rate": 4.069083688152206e-06, | |
| "loss": 0.2464, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.9830508474576272, | |
| "grad_norm": 0.49742989815197775, | |
| "learning_rate": 4.032634818481382e-06, | |
| "loss": 0.2435, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.47791981816360296, | |
| "learning_rate": 3.995656343272969e-06, | |
| "loss": 0.2431, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 1.0169491525423728, | |
| "grad_norm": 0.5336552382444577, | |
| "learning_rate": 3.958161040525354e-06, | |
| "loss": 0.2608, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.0338983050847457, | |
| "grad_norm": 0.6255845843539747, | |
| "learning_rate": 3.92016186682789e-06, | |
| "loss": 0.2718, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 1.0508474576271187, | |
| "grad_norm": 0.5386071690484918, | |
| "learning_rate": 3.88167195288371e-06, | |
| "loss": 0.2472, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 1.0677966101694916, | |
| "grad_norm": 0.4610208030538615, | |
| "learning_rate": 3.842704598972384e-06, | |
| "loss": 0.2711, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 1.0847457627118644, | |
| "grad_norm": 0.5005083514691049, | |
| "learning_rate": 3.80327327035398e-06, | |
| "loss": 0.243, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 1.1016949152542372, | |
| "grad_norm": 0.5256738643041071, | |
| "learning_rate": 3.763391592616104e-06, | |
| "loss": 0.2304, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 1.11864406779661, | |
| "grad_norm": 0.4461147519625525, | |
| "learning_rate": 3.7230733469655554e-06, | |
| "loss": 0.257, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 1.1355932203389831, | |
| "grad_norm": 0.5616719506601695, | |
| "learning_rate": 3.6823324654661923e-06, | |
| "loss": 0.2695, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 1.152542372881356, | |
| "grad_norm": 0.5267858101125528, | |
| "learning_rate": 3.6411830262246755e-06, | |
| "loss": 0.2626, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 1.1694915254237288, | |
| "grad_norm": 0.44572869985154356, | |
| "learning_rate": 3.599639248525749e-06, | |
| "loss": 0.2314, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 1.1864406779661016, | |
| "grad_norm": 0.4743820453627211, | |
| "learning_rate": 3.5577154879187286e-06, | |
| "loss": 0.2524, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.2033898305084745, | |
| "grad_norm": 0.4795920562830289, | |
| "learning_rate": 3.5154262312569134e-06, | |
| "loss": 0.2272, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 1.2203389830508475, | |
| "grad_norm": 0.4454617572461806, | |
| "learning_rate": 3.4727860916916143e-06, | |
| "loss": 0.2192, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 1.2372881355932204, | |
| "grad_norm": 0.45505003053894033, | |
| "learning_rate": 3.429809803622551e-06, | |
| "loss": 0.2222, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 1.2542372881355932, | |
| "grad_norm": 0.5092981797760533, | |
| "learning_rate": 3.386512217606339e-06, | |
| "loss": 0.2221, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 1.271186440677966, | |
| "grad_norm": 0.45652766088591107, | |
| "learning_rate": 3.342908295224854e-06, | |
| "loss": 0.2115, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 1.288135593220339, | |
| "grad_norm": 0.5773859878970916, | |
| "learning_rate": 3.299013103915214e-06, | |
| "loss": 0.2735, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 1.305084745762712, | |
| "grad_norm": 0.5403811470495191, | |
| "learning_rate": 3.2548418117631952e-06, | |
| "loss": 0.2248, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 1.3220338983050848, | |
| "grad_norm": 0.6646747594835315, | |
| "learning_rate": 3.2104096822618657e-06, | |
| "loss": 0.2554, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 1.3389830508474576, | |
| "grad_norm": 0.44285218067378557, | |
| "learning_rate": 3.1657320690372464e-06, | |
| "loss": 0.1984, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 1.3559322033898304, | |
| "grad_norm": 0.48055286049444484, | |
| "learning_rate": 3.120824410542833e-06, | |
| "loss": 0.2187, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.3728813559322033, | |
| "grad_norm": 0.5320390743973299, | |
| "learning_rate": 3.0757022247248e-06, | |
| "loss": 0.2232, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.3898305084745763, | |
| "grad_norm": 0.4807624431716425, | |
| "learning_rate": 3.0303811036597395e-06, | |
| "loss": 0.2403, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 1.4067796610169492, | |
| "grad_norm": 0.4244464101592162, | |
| "learning_rate": 2.9848767081667823e-06, | |
| "loss": 0.2213, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 1.423728813559322, | |
| "grad_norm": 0.49236319422304126, | |
| "learning_rate": 2.9392047623959653e-06, | |
| "loss": 0.2375, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.4406779661016949, | |
| "grad_norm": 0.46784645507056544, | |
| "learning_rate": 2.8933810483947156e-06, | |
| "loss": 0.2219, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.457627118644068, | |
| "grad_norm": 0.48973007159992155, | |
| "learning_rate": 2.8474214006543255e-06, | |
| "loss": 0.2166, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 1.4745762711864407, | |
| "grad_norm": 0.3510113111902409, | |
| "learning_rate": 2.8013417006383078e-06, | |
| "loss": 0.1943, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.4915254237288136, | |
| "grad_norm": 0.44578327104858334, | |
| "learning_rate": 2.755157871294521e-06, | |
| "loss": 0.234, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.5084745762711864, | |
| "grad_norm": 0.7165659629002311, | |
| "learning_rate": 2.708885871552954e-06, | |
| "loss": 0.2447, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.5254237288135593, | |
| "grad_norm": 0.45065744754362286, | |
| "learning_rate": 2.6625416908110825e-06, | |
| "loss": 0.2284, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.542372881355932, | |
| "grad_norm": 0.41704260341071947, | |
| "learning_rate": 2.616141343408696e-06, | |
| "loss": 0.2237, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.559322033898305, | |
| "grad_norm": 0.5042304558688149, | |
| "learning_rate": 2.569700863094104e-06, | |
| "loss": 0.2219, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.576271186440678, | |
| "grad_norm": 0.5190696344795624, | |
| "learning_rate": 2.5232362974836394e-06, | |
| "loss": 0.2321, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.5932203389830508, | |
| "grad_norm": 0.5279441416014571, | |
| "learning_rate": 2.4767637025163614e-06, | |
| "loss": 0.2715, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.6101694915254239, | |
| "grad_norm": 0.5147780575565639, | |
| "learning_rate": 2.4302991369058963e-06, | |
| "loss": 0.2445, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.6271186440677967, | |
| "grad_norm": 0.4916874716282566, | |
| "learning_rate": 2.3838586565913053e-06, | |
| "loss": 0.2045, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.6440677966101696, | |
| "grad_norm": 0.4829870763326134, | |
| "learning_rate": 2.3374583091889188e-06, | |
| "loss": 0.2154, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.6610169491525424, | |
| "grad_norm": 0.5093333087244544, | |
| "learning_rate": 2.2911141284470466e-06, | |
| "loss": 0.2453, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.6779661016949152, | |
| "grad_norm": 0.6588755855029546, | |
| "learning_rate": 2.2448421287054794e-06, | |
| "loss": 0.2668, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.694915254237288, | |
| "grad_norm": 0.49865992979225093, | |
| "learning_rate": 2.1986582993616926e-06, | |
| "loss": 0.2283, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.711864406779661, | |
| "grad_norm": 0.37695864859441414, | |
| "learning_rate": 2.1525785993456753e-06, | |
| "loss": 0.2057, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.7288135593220337, | |
| "grad_norm": 0.548172589959634, | |
| "learning_rate": 2.1066189516052848e-06, | |
| "loss": 0.2558, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.7457627118644068, | |
| "grad_norm": 0.7296309615305285, | |
| "learning_rate": 2.0607952376040355e-06, | |
| "loss": 0.2707, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.7627118644067796, | |
| "grad_norm": 0.6191305523245286, | |
| "learning_rate": 2.0151232918332186e-06, | |
| "loss": 0.216, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.7796610169491527, | |
| "grad_norm": 0.4275148733512332, | |
| "learning_rate": 1.9696188963402613e-06, | |
| "loss": 0.2016, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.7966101694915255, | |
| "grad_norm": 0.5433825312095228, | |
| "learning_rate": 1.9242977752752006e-06, | |
| "loss": 0.2463, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.8135593220338984, | |
| "grad_norm": 0.6146764112328339, | |
| "learning_rate": 1.879175589457168e-06, | |
| "loss": 0.217, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.8305084745762712, | |
| "grad_norm": 0.4492249547863865, | |
| "learning_rate": 1.8342679309627545e-06, | |
| "loss": 0.2112, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.847457627118644, | |
| "grad_norm": 0.45112403138122764, | |
| "learning_rate": 1.7895903177381351e-06, | |
| "loss": 0.2213, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.8644067796610169, | |
| "grad_norm": 0.6114123860260355, | |
| "learning_rate": 1.7451581882368052e-06, | |
| "loss": 0.2067, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.8813559322033897, | |
| "grad_norm": 0.47090009128241844, | |
| "learning_rate": 1.700986896084787e-06, | |
| "loss": 0.23, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.8983050847457628, | |
| "grad_norm": 0.4641120289884631, | |
| "learning_rate": 1.6570917047751465e-06, | |
| "loss": 0.2385, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.9152542372881356, | |
| "grad_norm": 0.6393272872227032, | |
| "learning_rate": 1.613487782393661e-06, | |
| "loss": 0.2196, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.9322033898305084, | |
| "grad_norm": 0.41826816207678297, | |
| "learning_rate": 1.5701901963774504e-06, | |
| "loss": 0.2031, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.9491525423728815, | |
| "grad_norm": 0.43623407997620883, | |
| "learning_rate": 1.5272139083083865e-06, | |
| "loss": 0.2168, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.9661016949152543, | |
| "grad_norm": 0.5121666691791417, | |
| "learning_rate": 1.4845737687430875e-06, | |
| "loss": 0.2476, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.9830508474576272, | |
| "grad_norm": 0.4875537170599579, | |
| "learning_rate": 1.4422845120812718e-06, | |
| "loss": 0.1885, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.6170831997594528, | |
| "learning_rate": 1.400360751474253e-06, | |
| "loss": 0.2491, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 2.016949152542373, | |
| "grad_norm": 0.4472827280319739, | |
| "learning_rate": 1.3588169737753258e-06, | |
| "loss": 0.1855, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 2.0338983050847457, | |
| "grad_norm": 0.5413068643010572, | |
| "learning_rate": 1.3176675345338085e-06, | |
| "loss": 0.2188, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 2.0508474576271185, | |
| "grad_norm": 0.428707546016933, | |
| "learning_rate": 1.276926653034444e-06, | |
| "loss": 0.2248, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 2.0677966101694913, | |
| "grad_norm": 0.5858749588254089, | |
| "learning_rate": 1.2366084073838963e-06, | |
| "loss": 0.2598, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 2.084745762711864, | |
| "grad_norm": 0.44846382451641165, | |
| "learning_rate": 1.1967267296460208e-06, | |
| "loss": 0.1837, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 2.1016949152542375, | |
| "grad_norm": 0.48106733288396136, | |
| "learning_rate": 1.157295401027616e-06, | |
| "loss": 0.2265, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 2.1186440677966103, | |
| "grad_norm": 0.5527311526391787, | |
| "learning_rate": 1.1183280471162916e-06, | |
| "loss": 0.2181, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 2.135593220338983, | |
| "grad_norm": 0.4955616787413392, | |
| "learning_rate": 1.079838133172111e-06, | |
| "loss": 0.2234, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 2.152542372881356, | |
| "grad_norm": 0.43338578262654337, | |
| "learning_rate": 1.0418389594746462e-06, | |
| "loss": 0.214, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 2.169491525423729, | |
| "grad_norm": 0.3669554740937264, | |
| "learning_rate": 1.0043436567270313e-06, | |
| "loss": 0.1856, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 2.1864406779661016, | |
| "grad_norm": 0.4089136976273204, | |
| "learning_rate": 9.673651815186186e-07, | |
| "loss": 0.181, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 2.2033898305084745, | |
| "grad_norm": 0.5094068860196916, | |
| "learning_rate": 9.309163118477954e-07, | |
| "loss": 0.232, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.2203389830508473, | |
| "grad_norm": 0.6111829159014325, | |
| "learning_rate": 8.950096427065232e-07, | |
| "loss": 0.256, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 2.23728813559322, | |
| "grad_norm": 0.5499628769610874, | |
| "learning_rate": 8.596575817281036e-07, | |
| "loss": 0.263, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 2.2542372881355934, | |
| "grad_norm": 0.472315352129982, | |
| "learning_rate": 8.248723448996942e-07, | |
| "loss": 0.2025, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 2.2711864406779663, | |
| "grad_norm": 0.44621416461066526, | |
| "learning_rate": 7.906659523410445e-07, | |
| "loss": 0.1964, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 2.288135593220339, | |
| "grad_norm": 0.5438276256045982, | |
| "learning_rate": 7.570502241509162e-07, | |
| "loss": 0.24, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 2.305084745762712, | |
| "grad_norm": 0.48076560661191803, | |
| "learning_rate": 7.240367763226214e-07, | |
| "loss": 0.2522, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 2.3220338983050848, | |
| "grad_norm": 0.45221392230794627, | |
| "learning_rate": 6.916370167300846e-07, | |
| "loss": 0.2031, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 2.3389830508474576, | |
| "grad_norm": 0.6301591765021999, | |
| "learning_rate": 6.59862141185832e-07, | |
| "loss": 0.2158, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 2.3559322033898304, | |
| "grad_norm": 0.3952157742860489, | |
| "learning_rate": 6.28723129572247e-07, | |
| "loss": 0.2078, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 2.3728813559322033, | |
| "grad_norm": 0.446514287134317, | |
| "learning_rate": 5.982307420474501e-07, | |
| "loss": 0.1948, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.389830508474576, | |
| "grad_norm": 0.5855904635323722, | |
| "learning_rate": 5.683955153270959e-07, | |
| "loss": 0.2657, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 2.406779661016949, | |
| "grad_norm": 0.5583109191195978, | |
| "learning_rate": 5.39227759043392e-07, | |
| "loss": 0.1933, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 2.423728813559322, | |
| "grad_norm": 0.47676799585910457, | |
| "learning_rate": 5.107375521825791e-07, | |
| "loss": 0.2273, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 2.440677966101695, | |
| "grad_norm": 0.4864750560824795, | |
| "learning_rate": 4.829347396021142e-07, | |
| "loss": 0.251, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 2.457627118644068, | |
| "grad_norm": 0.6512251016554255, | |
| "learning_rate": 4.5582892862875457e-07, | |
| "loss": 0.2521, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 2.4745762711864407, | |
| "grad_norm": 0.48233660615594676, | |
| "learning_rate": 4.294294857387285e-07, | |
| "loss": 0.2275, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 2.4915254237288136, | |
| "grad_norm": 0.6428476684215992, | |
| "learning_rate": 4.0374553332112374e-07, | |
| "loss": 0.2327, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 2.5084745762711864, | |
| "grad_norm": 0.5076426403103419, | |
| "learning_rate": 3.787859465256258e-07, | |
| "loss": 0.1708, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 2.5254237288135593, | |
| "grad_norm": 0.5026976859156602, | |
| "learning_rate": 3.545593501956901e-07, | |
| "loss": 0.213, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 2.542372881355932, | |
| "grad_norm": 0.5662522579882163, | |
| "learning_rate": 3.3107411588820527e-07, | |
| "loss": 0.1996, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.559322033898305, | |
| "grad_norm": 0.5195058601098169, | |
| "learning_rate": 3.083383589806846e-07, | |
| "loss": 0.2263, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 2.576271186440678, | |
| "grad_norm": 0.5200135379990353, | |
| "learning_rate": 2.8635993586697555e-07, | |
| "loss": 0.2139, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 2.593220338983051, | |
| "grad_norm": 0.4763652464214379, | |
| "learning_rate": 2.6514644124246675e-07, | |
| "loss": 0.2229, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 2.610169491525424, | |
| "grad_norm": 0.49804739846772966, | |
| "learning_rate": 2.447052054797233e-07, | |
| "loss": 0.217, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 2.6271186440677967, | |
| "grad_norm": 0.48757701271761317, | |
| "learning_rate": 2.2504329209545846e-07, | |
| "loss": 0.2081, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 2.6440677966101696, | |
| "grad_norm": 0.4947797017705308, | |
| "learning_rate": 2.0616749530971785e-07, | |
| "loss": 0.2351, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 2.6610169491525424, | |
| "grad_norm": 0.5222467856875551, | |
| "learning_rate": 1.8808433769812367e-07, | |
| "loss": 0.2205, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 2.6779661016949152, | |
| "grad_norm": 0.4225419072915034, | |
| "learning_rate": 1.7080006793798176e-07, | |
| "loss": 0.1926, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 2.694915254237288, | |
| "grad_norm": 0.5691086207780526, | |
| "learning_rate": 1.54320658649037e-07, | |
| "loss": 0.2317, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 2.711864406779661, | |
| "grad_norm": 0.3840618595955483, | |
| "learning_rate": 1.3865180432961977e-07, | |
| "loss": 0.1861, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.7288135593220337, | |
| "grad_norm": 0.43342821320697045, | |
| "learning_rate": 1.237989193889e-07, | |
| "loss": 0.1841, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 2.7457627118644066, | |
| "grad_norm": 0.4541304378156735, | |
| "learning_rate": 1.0976713627592561e-07, | |
| "loss": 0.1838, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 2.7627118644067794, | |
| "grad_norm": 0.47718407958651254, | |
| "learning_rate": 9.656130370609057e-08, | |
| "loss": 0.2156, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 2.7796610169491527, | |
| "grad_norm": 0.4203326388891047, | |
| "learning_rate": 8.418598498565217e-08, | |
| "loss": 0.198, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 2.7966101694915255, | |
| "grad_norm": 0.6344933451047076, | |
| "learning_rate": 7.264545643486997e-08, | |
| "loss": 0.2469, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 2.8135593220338984, | |
| "grad_norm": 0.5726793839966674, | |
| "learning_rate": 6.194370591031174e-08, | |
| "loss": 0.2504, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 2.830508474576271, | |
| "grad_norm": 0.44232201362023676, | |
| "learning_rate": 5.208443142684094e-08, | |
| "loss": 0.211, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 2.847457627118644, | |
| "grad_norm": 0.5141286149933719, | |
| "learning_rate": 4.307103987976041e-08, | |
| "loss": 0.231, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 2.864406779661017, | |
| "grad_norm": 0.4569119724260856, | |
| "learning_rate": 3.4906645867549547e-08, | |
| "loss": 0.1939, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 2.8813559322033897, | |
| "grad_norm": 0.48342076668279055, | |
| "learning_rate": 2.7594070615609426e-08, | |
| "loss": 0.2043, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.898305084745763, | |
| "grad_norm": 0.34035724769380715, | |
| "learning_rate": 2.1135841001380386e-08, | |
| "loss": 0.1772, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 2.915254237288136, | |
| "grad_norm": 0.4608098604477918, | |
| "learning_rate": 1.55341886811744e-08, | |
| "loss": 0.1966, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 2.9322033898305087, | |
| "grad_norm": 0.48086970561951203, | |
| "learning_rate": 1.0791049319021086e-08, | |
| "loss": 0.2193, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 2.9491525423728815, | |
| "grad_norm": 0.5098124389058276, | |
| "learning_rate": 6.908061917794417e-09, | |
| "loss": 0.2126, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 2.9661016949152543, | |
| "grad_norm": 0.4671835659759605, | |
| "learning_rate": 3.8865682528504975e-09, | |
| "loss": 0.2207, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 2.983050847457627, | |
| "grad_norm": 1.3155876369197672, | |
| "learning_rate": 1.7276124083753788e-09, | |
| "loss": 0.2563, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.5705208597149767, | |
| "learning_rate": 4.3194041659866405e-10, | |
| "loss": 0.2054, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 177, | |
| "total_flos": 118937526042624.0, | |
| "train_loss": 0.25391180055626367, | |
| "train_runtime": 14992.2104, | |
| "train_samples_per_second": 0.188, | |
| "train_steps_per_second": 0.012 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 177, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 118937526042624.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |