Science-Expert-R1 / trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
35498a0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9714285714285715,
"eval_steps": 500,
"global_step": 104,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01904761904761905,
"grad_norm": 1710495.2036469786,
"learning_rate": 0.0,
"loss": 1.3566,
"memory/device_reserved (GiB)": 126.71,
"memory/max_active (GiB)": 124.13,
"memory/max_allocated (GiB)": 122.77,
"step": 1,
"tokens_per_second_per_gpu": 3497.88
},
{
"epoch": 0.0380952380952381,
"grad_norm": 1558119.299961758,
"learning_rate": 8e-07,
"loss": 1.3538,
"memory/device_reserved (GiB)": 126.73,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 2,
"tokens_per_second_per_gpu": 3711.34
},
{
"epoch": 0.05714285714285714,
"grad_norm": 4186.518498313145,
"learning_rate": 1.6e-06,
"loss": 1.3529,
"memory/device_reserved (GiB)": 126.73,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 3,
"tokens_per_second_per_gpu": 3869.09
},
{
"epoch": 0.0761904761904762,
"grad_norm": 1230.1393406412694,
"learning_rate": 2.4e-06,
"loss": 1.3622,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.96,
"memory/max_allocated (GiB)": 122.82,
"step": 4,
"tokens_per_second_per_gpu": 3629.53
},
{
"epoch": 0.09523809523809523,
"grad_norm": 1035.6723923215748,
"learning_rate": 3.2e-06,
"loss": 1.3487,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 5,
"tokens_per_second_per_gpu": 3634.6
},
{
"epoch": 0.11428571428571428,
"grad_norm": 432.5460621726683,
"learning_rate": 4e-06,
"loss": 1.3432,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 6,
"tokens_per_second_per_gpu": 3776.49
},
{
"epoch": 0.13333333333333333,
"grad_norm": 1638.0331848931094,
"learning_rate": 4.8e-06,
"loss": 1.3677,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 7,
"tokens_per_second_per_gpu": 3647.88
},
{
"epoch": 0.1523809523809524,
"grad_norm": 2883.046779503214,
"learning_rate": 5.6e-06,
"loss": 1.3444,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 8,
"tokens_per_second_per_gpu": 3677.14
},
{
"epoch": 0.17142857142857143,
"grad_norm": 478.1216745871938,
"learning_rate": 6.4e-06,
"loss": 1.3305,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 9,
"tokens_per_second_per_gpu": 3739.63
},
{
"epoch": 0.19047619047619047,
"grad_norm": 1025.7505155071237,
"learning_rate": 7.2e-06,
"loss": 1.3362,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 10,
"tokens_per_second_per_gpu": 3730.56
},
{
"epoch": 0.20952380952380953,
"grad_norm": 1209.6274892436668,
"learning_rate": 8e-06,
"loss": 1.3325,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 11,
"tokens_per_second_per_gpu": 3661.21
},
{
"epoch": 0.22857142857142856,
"grad_norm": 1213.936189837833,
"learning_rate": 7.997766254921018e-06,
"loss": 1.3575,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 12,
"tokens_per_second_per_gpu": 3716.18
},
{
"epoch": 0.24761904761904763,
"grad_norm": 942.8786617202861,
"learning_rate": 7.991067514492613e-06,
"loss": 1.3145,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 13,
"tokens_per_second_per_gpu": 3587.32
},
{
"epoch": 0.26666666666666666,
"grad_norm": 3603.68277269405,
"learning_rate": 7.979911260354016e-06,
"loss": 1.3402,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 14,
"tokens_per_second_per_gpu": 3712.99
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2487.402838754216,
"learning_rate": 7.96430995261912e-06,
"loss": 1.2956,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 15,
"tokens_per_second_per_gpu": 3762.61
},
{
"epoch": 0.3047619047619048,
"grad_norm": 667.5903281250161,
"learning_rate": 7.944281015960114e-06,
"loss": 1.2992,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 16,
"tokens_per_second_per_gpu": 3358.5
},
{
"epoch": 0.3238095238095238,
"grad_norm": 167.9027323688511,
"learning_rate": 7.919846820146347e-06,
"loss": 1.3119,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 17,
"tokens_per_second_per_gpu": 3675.6
},
{
"epoch": 0.34285714285714286,
"grad_norm": 47.46189855084341,
"learning_rate": 7.891034655060149e-06,
"loss": 1.302,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 18,
"tokens_per_second_per_gpu": 3754.58
},
{
"epoch": 0.3619047619047619,
"grad_norm": 115.37054783431222,
"learning_rate": 7.857876700217507e-06,
"loss": 1.3066,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 19,
"tokens_per_second_per_gpu": 3763.17
},
{
"epoch": 0.38095238095238093,
"grad_norm": 72.03472195336599,
"learning_rate": 7.820409988827649e-06,
"loss": 1.2876,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 20,
"tokens_per_second_per_gpu": 3750.15
},
{
"epoch": 0.4,
"grad_norm": 123.88987560365385,
"learning_rate": 7.778676366431674e-06,
"loss": 1.2854,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 21,
"tokens_per_second_per_gpu": 3556.91
},
{
"epoch": 0.41904761904761906,
"grad_norm": 36.51030416393311,
"learning_rate": 7.73272244416641e-06,
"loss": 1.2799,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 22,
"tokens_per_second_per_gpu": 3627.03
},
{
"epoch": 0.4380952380952381,
"grad_norm": 37.445205147197846,
"learning_rate": 7.682599546705715e-06,
"loss": 1.2835,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 23,
"tokens_per_second_per_gpu": 3604.91
},
{
"epoch": 0.45714285714285713,
"grad_norm": 39.93974794828826,
"learning_rate": 7.628363654937363e-06,
"loss": 1.2947,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 24,
"tokens_per_second_per_gpu": 3782.86
},
{
"epoch": 0.47619047619047616,
"grad_norm": 59.41355630536809,
"learning_rate": 7.570075343439524e-06,
"loss": 1.2702,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 25,
"tokens_per_second_per_gpu": 3694.52
},
{
"epoch": 0.49523809523809526,
"grad_norm": 34.32373819297229,
"learning_rate": 7.507799712826686e-06,
"loss": 1.2984,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 26,
"tokens_per_second_per_gpu": 3613.01
},
{
"epoch": 0.5142857142857142,
"grad_norm": 21.68779916764309,
"learning_rate": 7.441606317040558e-06,
"loss": 1.2827,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 27,
"tokens_per_second_per_gpu": 3616.18
},
{
"epoch": 0.5333333333333333,
"grad_norm": 30.472648556953168,
"learning_rate": 7.371569085667188e-06,
"loss": 1.2801,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 28,
"tokens_per_second_per_gpu": 3754.99
},
{
"epoch": 0.5523809523809524,
"grad_norm": 19.319274693345776,
"learning_rate": 7.297766241367041e-06,
"loss": 1.2693,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 29,
"tokens_per_second_per_gpu": 3677.68
},
{
"epoch": 0.5714285714285714,
"grad_norm": 34.31430237097932,
"learning_rate": 7.220280212510252e-06,
"loss": 1.2581,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 30,
"tokens_per_second_per_gpu": 3730.31
},
{
"epoch": 0.5904761904761905,
"grad_norm": 82.8518096206661,
"learning_rate": 7.139197541114644e-06,
"loss": 1.2687,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 31,
"tokens_per_second_per_gpu": 3650.37
},
{
"epoch": 0.6095238095238096,
"grad_norm": 36.99675013730897,
"learning_rate": 7.0546087861893285e-06,
"loss": 1.2809,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 32,
"tokens_per_second_per_gpu": 3785.35
},
{
"epoch": 0.6285714285714286,
"grad_norm": 10.853195813384238,
"learning_rate": 6.96660842259183e-06,
"loss": 1.253,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 33,
"tokens_per_second_per_gpu": 3666.64
},
{
"epoch": 0.6476190476190476,
"grad_norm": 27.05353511161411,
"learning_rate": 6.875294735511717e-06,
"loss": 1.2601,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 34,
"tokens_per_second_per_gpu": 3808.86
},
{
"epoch": 0.6666666666666666,
"grad_norm": 11.079685605370564,
"learning_rate": 6.780769710698569e-06,
"loss": 1.2539,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 35,
"tokens_per_second_per_gpu": 3708.96
},
{
"epoch": 0.6857142857142857,
"grad_norm": 35.34021537624741,
"learning_rate": 6.683138920556894e-06,
"loss": 1.2362,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 36,
"tokens_per_second_per_gpu": 3819.32
},
{
"epoch": 0.7047619047619048,
"grad_norm": 47.246402607795154,
"learning_rate": 6.582511406235209e-06,
"loss": 1.2429,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 37,
"tokens_per_second_per_gpu": 3762.22
},
{
"epoch": 0.7238095238095238,
"grad_norm": 35.65219209343969,
"learning_rate": 6.4789995558409795e-06,
"loss": 1.2535,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 38,
"tokens_per_second_per_gpu": 3496.79
},
{
"epoch": 0.7428571428571429,
"grad_norm": 13.147263166038922,
"learning_rate": 6.3727189789174205e-06,
"loss": 1.2421,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 39,
"tokens_per_second_per_gpu": 3471.55
},
{
"epoch": 0.7619047619047619,
"grad_norm": 8.92693366901581,
"learning_rate": 6.263788377322381e-06,
"loss": 1.2587,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 40,
"tokens_per_second_per_gpu": 3700.61
},
{
"epoch": 0.780952380952381,
"grad_norm": 25.621463437533773,
"learning_rate": 6.152329412653491e-06,
"loss": 1.2535,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 41,
"tokens_per_second_per_gpu": 3696.17
},
{
"epoch": 0.8,
"grad_norm": 21.356947105637357,
"learning_rate": 6.038466570367669e-06,
"loss": 1.2437,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 42,
"tokens_per_second_per_gpu": 3679.52
},
{
"epoch": 0.819047619047619,
"grad_norm": 21.528748134497796,
"learning_rate": 5.922327020746735e-06,
"loss": 1.2243,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 43,
"tokens_per_second_per_gpu": 3654.06
},
{
"epoch": 0.8380952380952381,
"grad_norm": 14.734257530424147,
"learning_rate": 5.804040476864407e-06,
"loss": 1.2326,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 44,
"tokens_per_second_per_gpu": 3581.66
},
{
"epoch": 0.8571428571428571,
"grad_norm": 13.129280834101875,
"learning_rate": 5.68373904971334e-06,
"loss": 1.2442,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 45,
"tokens_per_second_per_gpu": 3788.2
},
{
"epoch": 0.8761904761904762,
"grad_norm": 14.976302382446457,
"learning_rate": 5.561557100653979e-06,
"loss": 1.2486,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 46,
"tokens_per_second_per_gpu": 3636.88
},
{
"epoch": 0.8952380952380953,
"grad_norm": 15.967232506668388,
"learning_rate": 5.43763109135005e-06,
"loss": 1.2338,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 47,
"tokens_per_second_per_gpu": 3759.31
},
{
"epoch": 0.9142857142857143,
"grad_norm": 16.354797247719976,
"learning_rate": 5.312099431358276e-06,
"loss": 1.2413,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 48,
"tokens_per_second_per_gpu": 3663.89
},
{
"epoch": 0.9333333333333333,
"grad_norm": 6.665663198954394,
"learning_rate": 5.185102323542536e-06,
"loss": 1.2395,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 49,
"tokens_per_second_per_gpu": 3727.2
},
{
"epoch": 0.9523809523809523,
"grad_norm": 9.1334624753648,
"learning_rate": 5.056781607485144e-06,
"loss": 1.2268,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 50,
"tokens_per_second_per_gpu": 3870.66
},
{
"epoch": 0.9714285714285714,
"grad_norm": 17.527340590112377,
"learning_rate": 4.927280601070113e-06,
"loss": 1.2248,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 51,
"tokens_per_second_per_gpu": 3582.22
},
{
"epoch": 0.9904761904761905,
"grad_norm": 19.222165420352905,
"learning_rate": 4.796743940415344e-06,
"loss": 1.2254,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 52,
"tokens_per_second_per_gpu": 3727.73
},
{
"epoch": 1.0,
"grad_norm": 16.84364160949164,
"learning_rate": 4.66531741833252e-06,
"loss": 1.242,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.17,
"memory/max_allocated (GiB)": 122.81,
"step": 53,
"tokens_per_second_per_gpu": 3750.91
},
{
"epoch": 1.019047619047619,
"grad_norm": 25.10526965511846,
"learning_rate": 4.533147821495116e-06,
"loss": 1.2426,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 54,
"tokens_per_second_per_gpu": 3667.97
},
{
"epoch": 1.0380952380952382,
"grad_norm": 24.822314802816855,
"learning_rate": 4.400382766496394e-06,
"loss": 1.2394,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 55,
"tokens_per_second_per_gpu": 3712.75
},
{
"epoch": 1.0571428571428572,
"grad_norm": 19.222938204469422,
"learning_rate": 4.267170534980487e-06,
"loss": 1.2269,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 56,
"tokens_per_second_per_gpu": 3874.53
},
{
"epoch": 1.0761904761904761,
"grad_norm": 14.962813195503772,
"learning_rate": 4.133659908030698e-06,
"loss": 1.233,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 57,
"tokens_per_second_per_gpu": 3626.61
},
{
"epoch": 1.0952380952380953,
"grad_norm": 23.099619927044888,
"learning_rate": 4e-06,
"loss": 1.2353,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 58,
"tokens_per_second_per_gpu": 3631.82
},
{
"epoch": 1.1142857142857143,
"grad_norm": 14.683578827379744,
"learning_rate": 3.8663400919693026e-06,
"loss": 1.2261,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 59,
"tokens_per_second_per_gpu": 3778.88
},
{
"epoch": 1.1333333333333333,
"grad_norm": 1363.244724375689,
"learning_rate": 3.7328294650195136e-06,
"loss": 1.2448,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 60,
"tokens_per_second_per_gpu": 3648.86
},
{
"epoch": 1.1523809523809523,
"grad_norm": 37.56736283967858,
"learning_rate": 3.5996172335036064e-06,
"loss": 1.2134,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 61,
"tokens_per_second_per_gpu": 3680.45
},
{
"epoch": 1.1714285714285715,
"grad_norm": 24.14759116678243,
"learning_rate": 3.4668521785048856e-06,
"loss": 1.2201,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 62,
"tokens_per_second_per_gpu": 3742.93
},
{
"epoch": 1.1904761904761905,
"grad_norm": 20.895518933622306,
"learning_rate": 3.3346825816674796e-06,
"loss": 1.2248,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 63,
"tokens_per_second_per_gpu": 3729.87
},
{
"epoch": 1.2095238095238094,
"grad_norm": 20.07417789192824,
"learning_rate": 3.2032560595846563e-06,
"loss": 1.2253,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 64,
"tokens_per_second_per_gpu": 3664.34
},
{
"epoch": 1.2285714285714286,
"grad_norm": 14.61511907498168,
"learning_rate": 3.0727193989298864e-06,
"loss": 1.241,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 65,
"tokens_per_second_per_gpu": 3721.56
},
{
"epoch": 1.2476190476190476,
"grad_norm": 18.1080641996899,
"learning_rate": 2.943218392514856e-06,
"loss": 1.2027,
"memory/device_reserved (GiB)": 127.34,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 66,
"tokens_per_second_per_gpu": 3589.14
},
{
"epoch": 1.2666666666666666,
"grad_norm": 88.35410261817876,
"learning_rate": 2.8148976764574643e-06,
"loss": 1.221,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 67,
"tokens_per_second_per_gpu": 3718.05
},
{
"epoch": 1.2857142857142856,
"grad_norm": 23.72041286077318,
"learning_rate": 2.6879005686417232e-06,
"loss": 1.2172,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 68,
"tokens_per_second_per_gpu": 3764.91
},
{
"epoch": 1.3047619047619048,
"grad_norm": 43.54234028579835,
"learning_rate": 2.5623689086499492e-06,
"loss": 1.2326,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 69,
"tokens_per_second_per_gpu": 3359.73
},
{
"epoch": 1.3238095238095238,
"grad_norm": 6.104685395227184,
"learning_rate": 2.4384428993460207e-06,
"loss": 1.2427,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 70,
"tokens_per_second_per_gpu": 3681.16
},
{
"epoch": 1.342857142857143,
"grad_norm": 9.963394838549585,
"learning_rate": 2.3162609502866607e-06,
"loss": 1.2322,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 71,
"tokens_per_second_per_gpu": 3753.07
},
{
"epoch": 1.361904761904762,
"grad_norm": 43.43949979845249,
"learning_rate": 2.195959523135592e-06,
"loss": 1.2383,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 72,
"tokens_per_second_per_gpu": 3764.97
},
{
"epoch": 1.380952380952381,
"grad_norm": 14.107017331391786,
"learning_rate": 2.077672979253265e-06,
"loss": 1.2225,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 73,
"tokens_per_second_per_gpu": 3751.34
},
{
"epoch": 1.4,
"grad_norm": 10.549323906590455,
"learning_rate": 1.96153342963233e-06,
"loss": 1.2214,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 74,
"tokens_per_second_per_gpu": 3559.51
},
{
"epoch": 1.4190476190476191,
"grad_norm": 18.592940657981064,
"learning_rate": 1.8476705873465096e-06,
"loss": 1.2171,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 75,
"tokens_per_second_per_gpu": 3629.78
},
{
"epoch": 1.438095238095238,
"grad_norm": 11.120257290964485,
"learning_rate": 1.7362116226776187e-06,
"loss": 1.2226,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 76,
"tokens_per_second_per_gpu": 3603.12
},
{
"epoch": 1.457142857142857,
"grad_norm": 7.078043688121306,
"learning_rate": 1.627281021082579e-06,
"loss": 1.2345,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 77,
"tokens_per_second_per_gpu": 3780.85
},
{
"epoch": 1.4761904761904763,
"grad_norm": 5.000285151965608,
"learning_rate": 1.521000444159021e-06,
"loss": 1.2116,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 78,
"tokens_per_second_per_gpu": 3695.41
},
{
"epoch": 1.4952380952380953,
"grad_norm": 47.84624251792891,
"learning_rate": 1.4174885937647903e-06,
"loss": 1.2405,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 79,
"tokens_per_second_per_gpu": 3605.95
},
{
"epoch": 1.5142857142857142,
"grad_norm": 12.461343395029726,
"learning_rate": 1.316861079443107e-06,
"loss": 1.2272,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 80,
"tokens_per_second_per_gpu": 3613.63
},
{
"epoch": 1.5333333333333332,
"grad_norm": 7.656217867750634,
"learning_rate": 1.2192302893014308e-06,
"loss": 1.2265,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 81,
"tokens_per_second_per_gpu": 3752.87
},
{
"epoch": 1.5523809523809524,
"grad_norm": 15.082668616044355,
"learning_rate": 1.1247052644882832e-06,
"loss": 1.2183,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 82,
"tokens_per_second_per_gpu": 3677.86
},
{
"epoch": 1.5714285714285714,
"grad_norm": 16.44949015042616,
"learning_rate": 1.0333915774081697e-06,
"loss": 1.2099,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.83,
"memory/max_allocated (GiB)": 122.82,
"step": 83,
"tokens_per_second_per_gpu": 3729.01
},
{
"epoch": 1.5904761904761906,
"grad_norm": 12.211227945509856,
"learning_rate": 9.453912138106721e-07,
"loss": 1.2231,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 84,
"tokens_per_second_per_gpu": 3649.49
},
{
"epoch": 1.6095238095238096,
"grad_norm": 7.074192518964132,
"learning_rate": 8.60802458885356e-07,
"loss": 1.237,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 85,
"tokens_per_second_per_gpu": 3783.87
},
{
"epoch": 1.6285714285714286,
"grad_norm": 13.131068251165631,
"learning_rate": 7.797197874897485e-07,
"loss": 1.2116,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 86,
"tokens_per_second_per_gpu": 3671.48
},
{
"epoch": 1.6476190476190475,
"grad_norm": 15.417850715738988,
"learning_rate": 7.022337586329596e-07,
"loss": 1.2209,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 87,
"tokens_per_second_per_gpu": 3805.45
},
{
"epoch": 1.6666666666666665,
"grad_norm": 24.13403904325753,
"learning_rate": 6.28430914332812e-07,
"loss": 1.217,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 88,
"tokens_per_second_per_gpu": 3706.88
},
{
"epoch": 1.6857142857142857,
"grad_norm": 13.576166990616798,
"learning_rate": 5.583936829594433e-07,
"loss": 1.2017,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 89,
"tokens_per_second_per_gpu": 3820.7
},
{
"epoch": 1.704761904761905,
"grad_norm": 8.573005189398867,
"learning_rate": 4.92200287173314e-07,
"loss": 1.2096,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 90,
"tokens_per_second_per_gpu": 3759.45
},
{
"epoch": 1.723809523809524,
"grad_norm": 5.5800010726124025,
"learning_rate": 4.299246565604755e-07,
"loss": 1.2218,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 91,
"tokens_per_second_per_gpu": 3499.8
},
{
"epoch": 1.7428571428571429,
"grad_norm": 6.765368030458938,
"learning_rate": 3.716363450626372e-07,
"loss": 1.2117,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 92,
"tokens_per_second_per_gpu": 3468.37
},
{
"epoch": 1.7619047619047619,
"grad_norm": 7.504548685452772,
"learning_rate": 3.174004532942844e-07,
"loss": 1.2299,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 93,
"tokens_per_second_per_gpu": 3700.98
},
{
"epoch": 1.7809523809523808,
"grad_norm": 8.649122371866438,
"learning_rate": 2.672775558335898e-07,
"loss": 1.2265,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 94,
"tokens_per_second_per_gpu": 3700.55
},
{
"epoch": 1.8,
"grad_norm": 11.91832294221251,
"learning_rate": 2.2132363356832528e-07,
"loss": 1.2185,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 95,
"tokens_per_second_per_gpu": 3680.96
},
{
"epoch": 1.819047619047619,
"grad_norm": 9.186156818821193,
"learning_rate": 1.795900111723503e-07,
"loss": 1.2008,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 96,
"tokens_per_second_per_gpu": 3658.84
},
{
"epoch": 1.8380952380952382,
"grad_norm": 13.72977541399496,
"learning_rate": 1.4212329978249415e-07,
"loss": 1.2104,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.99,
"memory/max_allocated (GiB)": 122.82,
"step": 97,
"tokens_per_second_per_gpu": 3581.27
},
{
"epoch": 1.8571428571428572,
"grad_norm": 6.290457692715211,
"learning_rate": 1.0896534493985177e-07,
"loss": 1.223,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 98,
"tokens_per_second_per_gpu": 3791.37
},
{
"epoch": 1.8761904761904762,
"grad_norm": 9.702798624165407,
"learning_rate": 8.0153179853653e-08,
"loss": 1.2285,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 99,
"tokens_per_second_per_gpu": 3639.09
},
{
"epoch": 1.8952380952380952,
"grad_norm": 11.005975725667684,
"learning_rate": 5.571898403988573e-08,
"loss": 1.2151,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 100,
"tokens_per_second_per_gpu": 3757.98
},
{
"epoch": 1.9142857142857141,
"grad_norm": 8.44842365055977,
"learning_rate": 3.569004738087988e-08,
"loss": 1.2238,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 101,
"tokens_per_second_per_gpu": 3661.88
},
{
"epoch": 1.9333333333333333,
"grad_norm": 4.816542675360639,
"learning_rate": 2.0088739645983455e-08,
"loss": 1.2232,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 102,
"tokens_per_second_per_gpu": 3730.52
},
{
"epoch": 1.9523809523809523,
"grad_norm": 11.749396247795026,
"learning_rate": 8.932485507387344e-09,
"loss": 1.2118,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 103,
"tokens_per_second_per_gpu": 3871.77
},
{
"epoch": 1.9714285714285715,
"grad_norm": 7.9532371124526104,
"learning_rate": 2.2337450789815526e-09,
"loss": 1.2109,
"memory/device_reserved (GiB)": 127.42,
"memory/max_active (GiB)": 124.18,
"memory/max_allocated (GiB)": 122.82,
"step": 104,
"tokens_per_second_per_gpu": 3582.44
}
],
"logging_steps": 1,
"max_steps": 104,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 13,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1428859668922368.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}