| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9714285714285715, | |
| "eval_steps": 500, | |
| "global_step": 104, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01904761904761905, | |
| "grad_norm": 1710495.2036469786, | |
| "learning_rate": 0.0, | |
| "loss": 1.3566, | |
| "memory/device_reserved (GiB)": 126.71, | |
| "memory/max_active (GiB)": 124.13, | |
| "memory/max_allocated (GiB)": 122.77, | |
| "step": 1, | |
| "tokens_per_second_per_gpu": 3497.88 | |
| }, | |
| { | |
| "epoch": 0.0380952380952381, | |
| "grad_norm": 1558119.299961758, | |
| "learning_rate": 8e-07, | |
| "loss": 1.3538, | |
| "memory/device_reserved (GiB)": 126.73, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 2, | |
| "tokens_per_second_per_gpu": 3711.34 | |
| }, | |
| { | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 4186.518498313145, | |
| "learning_rate": 1.6e-06, | |
| "loss": 1.3529, | |
| "memory/device_reserved (GiB)": 126.73, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 3, | |
| "tokens_per_second_per_gpu": 3869.09 | |
| }, | |
| { | |
| "epoch": 0.0761904761904762, | |
| "grad_norm": 1230.1393406412694, | |
| "learning_rate": 2.4e-06, | |
| "loss": 1.3622, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.96, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 4, | |
| "tokens_per_second_per_gpu": 3629.53 | |
| }, | |
| { | |
| "epoch": 0.09523809523809523, | |
| "grad_norm": 1035.6723923215748, | |
| "learning_rate": 3.2e-06, | |
| "loss": 1.3487, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 5, | |
| "tokens_per_second_per_gpu": 3634.6 | |
| }, | |
| { | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 432.5460621726683, | |
| "learning_rate": 4e-06, | |
| "loss": 1.3432, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 6, | |
| "tokens_per_second_per_gpu": 3776.49 | |
| }, | |
| { | |
| "epoch": 0.13333333333333333, | |
| "grad_norm": 1638.0331848931094, | |
| "learning_rate": 4.8e-06, | |
| "loss": 1.3677, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 7, | |
| "tokens_per_second_per_gpu": 3647.88 | |
| }, | |
| { | |
| "epoch": 0.1523809523809524, | |
| "grad_norm": 2883.046779503214, | |
| "learning_rate": 5.6e-06, | |
| "loss": 1.3444, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 8, | |
| "tokens_per_second_per_gpu": 3677.14 | |
| }, | |
| { | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 478.1216745871938, | |
| "learning_rate": 6.4e-06, | |
| "loss": 1.3305, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 9, | |
| "tokens_per_second_per_gpu": 3739.63 | |
| }, | |
| { | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 1025.7505155071237, | |
| "learning_rate": 7.2e-06, | |
| "loss": 1.3362, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 10, | |
| "tokens_per_second_per_gpu": 3730.56 | |
| }, | |
| { | |
| "epoch": 0.20952380952380953, | |
| "grad_norm": 1209.6274892436668, | |
| "learning_rate": 8e-06, | |
| "loss": 1.3325, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 11, | |
| "tokens_per_second_per_gpu": 3661.21 | |
| }, | |
| { | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 1213.936189837833, | |
| "learning_rate": 7.997766254921018e-06, | |
| "loss": 1.3575, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 12, | |
| "tokens_per_second_per_gpu": 3716.18 | |
| }, | |
| { | |
| "epoch": 0.24761904761904763, | |
| "grad_norm": 942.8786617202861, | |
| "learning_rate": 7.991067514492613e-06, | |
| "loss": 1.3145, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 13, | |
| "tokens_per_second_per_gpu": 3587.32 | |
| }, | |
| { | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 3603.68277269405, | |
| "learning_rate": 7.979911260354016e-06, | |
| "loss": 1.3402, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 14, | |
| "tokens_per_second_per_gpu": 3712.99 | |
| }, | |
| { | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 2487.402838754216, | |
| "learning_rate": 7.96430995261912e-06, | |
| "loss": 1.2956, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 15, | |
| "tokens_per_second_per_gpu": 3762.61 | |
| }, | |
| { | |
| "epoch": 0.3047619047619048, | |
| "grad_norm": 667.5903281250161, | |
| "learning_rate": 7.944281015960114e-06, | |
| "loss": 1.2992, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 16, | |
| "tokens_per_second_per_gpu": 3358.5 | |
| }, | |
| { | |
| "epoch": 0.3238095238095238, | |
| "grad_norm": 167.9027323688511, | |
| "learning_rate": 7.919846820146347e-06, | |
| "loss": 1.3119, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 17, | |
| "tokens_per_second_per_gpu": 3675.6 | |
| }, | |
| { | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 47.46189855084341, | |
| "learning_rate": 7.891034655060149e-06, | |
| "loss": 1.302, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 18, | |
| "tokens_per_second_per_gpu": 3754.58 | |
| }, | |
| { | |
| "epoch": 0.3619047619047619, | |
| "grad_norm": 115.37054783431222, | |
| "learning_rate": 7.857876700217507e-06, | |
| "loss": 1.3066, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 19, | |
| "tokens_per_second_per_gpu": 3763.17 | |
| }, | |
| { | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 72.03472195336599, | |
| "learning_rate": 7.820409988827649e-06, | |
| "loss": 1.2876, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 20, | |
| "tokens_per_second_per_gpu": 3750.15 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 123.88987560365385, | |
| "learning_rate": 7.778676366431674e-06, | |
| "loss": 1.2854, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 21, | |
| "tokens_per_second_per_gpu": 3556.91 | |
| }, | |
| { | |
| "epoch": 0.41904761904761906, | |
| "grad_norm": 36.51030416393311, | |
| "learning_rate": 7.73272244416641e-06, | |
| "loss": 1.2799, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 22, | |
| "tokens_per_second_per_gpu": 3627.03 | |
| }, | |
| { | |
| "epoch": 0.4380952380952381, | |
| "grad_norm": 37.445205147197846, | |
| "learning_rate": 7.682599546705715e-06, | |
| "loss": 1.2835, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 23, | |
| "tokens_per_second_per_gpu": 3604.91 | |
| }, | |
| { | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 39.93974794828826, | |
| "learning_rate": 7.628363654937363e-06, | |
| "loss": 1.2947, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 24, | |
| "tokens_per_second_per_gpu": 3782.86 | |
| }, | |
| { | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 59.41355630536809, | |
| "learning_rate": 7.570075343439524e-06, | |
| "loss": 1.2702, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 25, | |
| "tokens_per_second_per_gpu": 3694.52 | |
| }, | |
| { | |
| "epoch": 0.49523809523809526, | |
| "grad_norm": 34.32373819297229, | |
| "learning_rate": 7.507799712826686e-06, | |
| "loss": 1.2984, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 26, | |
| "tokens_per_second_per_gpu": 3613.01 | |
| }, | |
| { | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 21.68779916764309, | |
| "learning_rate": 7.441606317040558e-06, | |
| "loss": 1.2827, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 27, | |
| "tokens_per_second_per_gpu": 3616.18 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 30.472648556953168, | |
| "learning_rate": 7.371569085667188e-06, | |
| "loss": 1.2801, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 28, | |
| "tokens_per_second_per_gpu": 3754.99 | |
| }, | |
| { | |
| "epoch": 0.5523809523809524, | |
| "grad_norm": 19.319274693345776, | |
| "learning_rate": 7.297766241367041e-06, | |
| "loss": 1.2693, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 29, | |
| "tokens_per_second_per_gpu": 3677.68 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 34.31430237097932, | |
| "learning_rate": 7.220280212510252e-06, | |
| "loss": 1.2581, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 30, | |
| "tokens_per_second_per_gpu": 3730.31 | |
| }, | |
| { | |
| "epoch": 0.5904761904761905, | |
| "grad_norm": 82.8518096206661, | |
| "learning_rate": 7.139197541114644e-06, | |
| "loss": 1.2687, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 31, | |
| "tokens_per_second_per_gpu": 3650.37 | |
| }, | |
| { | |
| "epoch": 0.6095238095238096, | |
| "grad_norm": 36.99675013730897, | |
| "learning_rate": 7.0546087861893285e-06, | |
| "loss": 1.2809, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 32, | |
| "tokens_per_second_per_gpu": 3785.35 | |
| }, | |
| { | |
| "epoch": 0.6285714285714286, | |
| "grad_norm": 10.853195813384238, | |
| "learning_rate": 6.96660842259183e-06, | |
| "loss": 1.253, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 33, | |
| "tokens_per_second_per_gpu": 3666.64 | |
| }, | |
| { | |
| "epoch": 0.6476190476190476, | |
| "grad_norm": 27.05353511161411, | |
| "learning_rate": 6.875294735511717e-06, | |
| "loss": 1.2601, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 34, | |
| "tokens_per_second_per_gpu": 3808.86 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 11.079685605370564, | |
| "learning_rate": 6.780769710698569e-06, | |
| "loss": 1.2539, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 35, | |
| "tokens_per_second_per_gpu": 3708.96 | |
| }, | |
| { | |
| "epoch": 0.6857142857142857, | |
| "grad_norm": 35.34021537624741, | |
| "learning_rate": 6.683138920556894e-06, | |
| "loss": 1.2362, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 36, | |
| "tokens_per_second_per_gpu": 3819.32 | |
| }, | |
| { | |
| "epoch": 0.7047619047619048, | |
| "grad_norm": 47.246402607795154, | |
| "learning_rate": 6.582511406235209e-06, | |
| "loss": 1.2429, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 37, | |
| "tokens_per_second_per_gpu": 3762.22 | |
| }, | |
| { | |
| "epoch": 0.7238095238095238, | |
| "grad_norm": 35.65219209343969, | |
| "learning_rate": 6.4789995558409795e-06, | |
| "loss": 1.2535, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 38, | |
| "tokens_per_second_per_gpu": 3496.79 | |
| }, | |
| { | |
| "epoch": 0.7428571428571429, | |
| "grad_norm": 13.147263166038922, | |
| "learning_rate": 6.3727189789174205e-06, | |
| "loss": 1.2421, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 39, | |
| "tokens_per_second_per_gpu": 3471.55 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 8.92693366901581, | |
| "learning_rate": 6.263788377322381e-06, | |
| "loss": 1.2587, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 40, | |
| "tokens_per_second_per_gpu": 3700.61 | |
| }, | |
| { | |
| "epoch": 0.780952380952381, | |
| "grad_norm": 25.621463437533773, | |
| "learning_rate": 6.152329412653491e-06, | |
| "loss": 1.2535, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 41, | |
| "tokens_per_second_per_gpu": 3696.17 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 21.356947105637357, | |
| "learning_rate": 6.038466570367669e-06, | |
| "loss": 1.2437, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 42, | |
| "tokens_per_second_per_gpu": 3679.52 | |
| }, | |
| { | |
| "epoch": 0.819047619047619, | |
| "grad_norm": 21.528748134497796, | |
| "learning_rate": 5.922327020746735e-06, | |
| "loss": 1.2243, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 43, | |
| "tokens_per_second_per_gpu": 3654.06 | |
| }, | |
| { | |
| "epoch": 0.8380952380952381, | |
| "grad_norm": 14.734257530424147, | |
| "learning_rate": 5.804040476864407e-06, | |
| "loss": 1.2326, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 44, | |
| "tokens_per_second_per_gpu": 3581.66 | |
| }, | |
| { | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 13.129280834101875, | |
| "learning_rate": 5.68373904971334e-06, | |
| "loss": 1.2442, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 45, | |
| "tokens_per_second_per_gpu": 3788.2 | |
| }, | |
| { | |
| "epoch": 0.8761904761904762, | |
| "grad_norm": 14.976302382446457, | |
| "learning_rate": 5.561557100653979e-06, | |
| "loss": 1.2486, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 46, | |
| "tokens_per_second_per_gpu": 3636.88 | |
| }, | |
| { | |
| "epoch": 0.8952380952380953, | |
| "grad_norm": 15.967232506668388, | |
| "learning_rate": 5.43763109135005e-06, | |
| "loss": 1.2338, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 47, | |
| "tokens_per_second_per_gpu": 3759.31 | |
| }, | |
| { | |
| "epoch": 0.9142857142857143, | |
| "grad_norm": 16.354797247719976, | |
| "learning_rate": 5.312099431358276e-06, | |
| "loss": 1.2413, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 48, | |
| "tokens_per_second_per_gpu": 3663.89 | |
| }, | |
| { | |
| "epoch": 0.9333333333333333, | |
| "grad_norm": 6.665663198954394, | |
| "learning_rate": 5.185102323542536e-06, | |
| "loss": 1.2395, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 49, | |
| "tokens_per_second_per_gpu": 3727.2 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 9.1334624753648, | |
| "learning_rate": 5.056781607485144e-06, | |
| "loss": 1.2268, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 50, | |
| "tokens_per_second_per_gpu": 3870.66 | |
| }, | |
| { | |
| "epoch": 0.9714285714285714, | |
| "grad_norm": 17.527340590112377, | |
| "learning_rate": 4.927280601070113e-06, | |
| "loss": 1.2248, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 51, | |
| "tokens_per_second_per_gpu": 3582.22 | |
| }, | |
| { | |
| "epoch": 0.9904761904761905, | |
| "grad_norm": 19.222165420352905, | |
| "learning_rate": 4.796743940415344e-06, | |
| "loss": 1.2254, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 52, | |
| "tokens_per_second_per_gpu": 3727.73 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 16.84364160949164, | |
| "learning_rate": 4.66531741833252e-06, | |
| "loss": 1.242, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.17, | |
| "memory/max_allocated (GiB)": 122.81, | |
| "step": 53, | |
| "tokens_per_second_per_gpu": 3750.91 | |
| }, | |
| { | |
| "epoch": 1.019047619047619, | |
| "grad_norm": 25.10526965511846, | |
| "learning_rate": 4.533147821495116e-06, | |
| "loss": 1.2426, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 54, | |
| "tokens_per_second_per_gpu": 3667.97 | |
| }, | |
| { | |
| "epoch": 1.0380952380952382, | |
| "grad_norm": 24.822314802816855, | |
| "learning_rate": 4.400382766496394e-06, | |
| "loss": 1.2394, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 55, | |
| "tokens_per_second_per_gpu": 3712.75 | |
| }, | |
| { | |
| "epoch": 1.0571428571428572, | |
| "grad_norm": 19.222938204469422, | |
| "learning_rate": 4.267170534980487e-06, | |
| "loss": 1.2269, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 56, | |
| "tokens_per_second_per_gpu": 3874.53 | |
| }, | |
| { | |
| "epoch": 1.0761904761904761, | |
| "grad_norm": 14.962813195503772, | |
| "learning_rate": 4.133659908030698e-06, | |
| "loss": 1.233, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 57, | |
| "tokens_per_second_per_gpu": 3626.61 | |
| }, | |
| { | |
| "epoch": 1.0952380952380953, | |
| "grad_norm": 23.099619927044888, | |
| "learning_rate": 4e-06, | |
| "loss": 1.2353, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 58, | |
| "tokens_per_second_per_gpu": 3631.82 | |
| }, | |
| { | |
| "epoch": 1.1142857142857143, | |
| "grad_norm": 14.683578827379744, | |
| "learning_rate": 3.8663400919693026e-06, | |
| "loss": 1.2261, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 59, | |
| "tokens_per_second_per_gpu": 3778.88 | |
| }, | |
| { | |
| "epoch": 1.1333333333333333, | |
| "grad_norm": 1363.244724375689, | |
| "learning_rate": 3.7328294650195136e-06, | |
| "loss": 1.2448, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 60, | |
| "tokens_per_second_per_gpu": 3648.86 | |
| }, | |
| { | |
| "epoch": 1.1523809523809523, | |
| "grad_norm": 37.56736283967858, | |
| "learning_rate": 3.5996172335036064e-06, | |
| "loss": 1.2134, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 61, | |
| "tokens_per_second_per_gpu": 3680.45 | |
| }, | |
| { | |
| "epoch": 1.1714285714285715, | |
| "grad_norm": 24.14759116678243, | |
| "learning_rate": 3.4668521785048856e-06, | |
| "loss": 1.2201, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 62, | |
| "tokens_per_second_per_gpu": 3742.93 | |
| }, | |
| { | |
| "epoch": 1.1904761904761905, | |
| "grad_norm": 20.895518933622306, | |
| "learning_rate": 3.3346825816674796e-06, | |
| "loss": 1.2248, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 63, | |
| "tokens_per_second_per_gpu": 3729.87 | |
| }, | |
| { | |
| "epoch": 1.2095238095238094, | |
| "grad_norm": 20.07417789192824, | |
| "learning_rate": 3.2032560595846563e-06, | |
| "loss": 1.2253, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 64, | |
| "tokens_per_second_per_gpu": 3664.34 | |
| }, | |
| { | |
| "epoch": 1.2285714285714286, | |
| "grad_norm": 14.61511907498168, | |
| "learning_rate": 3.0727193989298864e-06, | |
| "loss": 1.241, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 65, | |
| "tokens_per_second_per_gpu": 3721.56 | |
| }, | |
| { | |
| "epoch": 1.2476190476190476, | |
| "grad_norm": 18.1080641996899, | |
| "learning_rate": 2.943218392514856e-06, | |
| "loss": 1.2027, | |
| "memory/device_reserved (GiB)": 127.34, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 66, | |
| "tokens_per_second_per_gpu": 3589.14 | |
| }, | |
| { | |
| "epoch": 1.2666666666666666, | |
| "grad_norm": 88.35410261817876, | |
| "learning_rate": 2.8148976764574643e-06, | |
| "loss": 1.221, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 67, | |
| "tokens_per_second_per_gpu": 3718.05 | |
| }, | |
| { | |
| "epoch": 1.2857142857142856, | |
| "grad_norm": 23.72041286077318, | |
| "learning_rate": 2.6879005686417232e-06, | |
| "loss": 1.2172, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 68, | |
| "tokens_per_second_per_gpu": 3764.91 | |
| }, | |
| { | |
| "epoch": 1.3047619047619048, | |
| "grad_norm": 43.54234028579835, | |
| "learning_rate": 2.5623689086499492e-06, | |
| "loss": 1.2326, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 69, | |
| "tokens_per_second_per_gpu": 3359.73 | |
| }, | |
| { | |
| "epoch": 1.3238095238095238, | |
| "grad_norm": 6.104685395227184, | |
| "learning_rate": 2.4384428993460207e-06, | |
| "loss": 1.2427, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 70, | |
| "tokens_per_second_per_gpu": 3681.16 | |
| }, | |
| { | |
| "epoch": 1.342857142857143, | |
| "grad_norm": 9.963394838549585, | |
| "learning_rate": 2.3162609502866607e-06, | |
| "loss": 1.2322, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 71, | |
| "tokens_per_second_per_gpu": 3753.07 | |
| }, | |
| { | |
| "epoch": 1.361904761904762, | |
| "grad_norm": 43.43949979845249, | |
| "learning_rate": 2.195959523135592e-06, | |
| "loss": 1.2383, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 72, | |
| "tokens_per_second_per_gpu": 3764.97 | |
| }, | |
| { | |
| "epoch": 1.380952380952381, | |
| "grad_norm": 14.107017331391786, | |
| "learning_rate": 2.077672979253265e-06, | |
| "loss": 1.2225, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 73, | |
| "tokens_per_second_per_gpu": 3751.34 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 10.549323906590455, | |
| "learning_rate": 1.96153342963233e-06, | |
| "loss": 1.2214, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 74, | |
| "tokens_per_second_per_gpu": 3559.51 | |
| }, | |
| { | |
| "epoch": 1.4190476190476191, | |
| "grad_norm": 18.592940657981064, | |
| "learning_rate": 1.8476705873465096e-06, | |
| "loss": 1.2171, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 75, | |
| "tokens_per_second_per_gpu": 3629.78 | |
| }, | |
| { | |
| "epoch": 1.438095238095238, | |
| "grad_norm": 11.120257290964485, | |
| "learning_rate": 1.7362116226776187e-06, | |
| "loss": 1.2226, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 76, | |
| "tokens_per_second_per_gpu": 3603.12 | |
| }, | |
| { | |
| "epoch": 1.457142857142857, | |
| "grad_norm": 7.078043688121306, | |
| "learning_rate": 1.627281021082579e-06, | |
| "loss": 1.2345, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 77, | |
| "tokens_per_second_per_gpu": 3780.85 | |
| }, | |
| { | |
| "epoch": 1.4761904761904763, | |
| "grad_norm": 5.000285151965608, | |
| "learning_rate": 1.521000444159021e-06, | |
| "loss": 1.2116, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 78, | |
| "tokens_per_second_per_gpu": 3695.41 | |
| }, | |
| { | |
| "epoch": 1.4952380952380953, | |
| "grad_norm": 47.84624251792891, | |
| "learning_rate": 1.4174885937647903e-06, | |
| "loss": 1.2405, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 79, | |
| "tokens_per_second_per_gpu": 3605.95 | |
| }, | |
| { | |
| "epoch": 1.5142857142857142, | |
| "grad_norm": 12.461343395029726, | |
| "learning_rate": 1.316861079443107e-06, | |
| "loss": 1.2272, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 80, | |
| "tokens_per_second_per_gpu": 3613.63 | |
| }, | |
| { | |
| "epoch": 1.5333333333333332, | |
| "grad_norm": 7.656217867750634, | |
| "learning_rate": 1.2192302893014308e-06, | |
| "loss": 1.2265, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 81, | |
| "tokens_per_second_per_gpu": 3752.87 | |
| }, | |
| { | |
| "epoch": 1.5523809523809524, | |
| "grad_norm": 15.082668616044355, | |
| "learning_rate": 1.1247052644882832e-06, | |
| "loss": 1.2183, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 82, | |
| "tokens_per_second_per_gpu": 3677.86 | |
| }, | |
| { | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 16.44949015042616, | |
| "learning_rate": 1.0333915774081697e-06, | |
| "loss": 1.2099, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.83, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 83, | |
| "tokens_per_second_per_gpu": 3729.01 | |
| }, | |
| { | |
| "epoch": 1.5904761904761906, | |
| "grad_norm": 12.211227945509856, | |
| "learning_rate": 9.453912138106721e-07, | |
| "loss": 1.2231, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 84, | |
| "tokens_per_second_per_gpu": 3649.49 | |
| }, | |
| { | |
| "epoch": 1.6095238095238096, | |
| "grad_norm": 7.074192518964132, | |
| "learning_rate": 8.60802458885356e-07, | |
| "loss": 1.237, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 85, | |
| "tokens_per_second_per_gpu": 3783.87 | |
| }, | |
| { | |
| "epoch": 1.6285714285714286, | |
| "grad_norm": 13.131068251165631, | |
| "learning_rate": 7.797197874897485e-07, | |
| "loss": 1.2116, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 86, | |
| "tokens_per_second_per_gpu": 3671.48 | |
| }, | |
| { | |
| "epoch": 1.6476190476190475, | |
| "grad_norm": 15.417850715738988, | |
| "learning_rate": 7.022337586329596e-07, | |
| "loss": 1.2209, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 87, | |
| "tokens_per_second_per_gpu": 3805.45 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 24.13403904325753, | |
| "learning_rate": 6.28430914332812e-07, | |
| "loss": 1.217, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 88, | |
| "tokens_per_second_per_gpu": 3706.88 | |
| }, | |
| { | |
| "epoch": 1.6857142857142857, | |
| "grad_norm": 13.576166990616798, | |
| "learning_rate": 5.583936829594433e-07, | |
| "loss": 1.2017, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 89, | |
| "tokens_per_second_per_gpu": 3820.7 | |
| }, | |
| { | |
| "epoch": 1.704761904761905, | |
| "grad_norm": 8.573005189398867, | |
| "learning_rate": 4.92200287173314e-07, | |
| "loss": 1.2096, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 90, | |
| "tokens_per_second_per_gpu": 3759.45 | |
| }, | |
| { | |
| "epoch": 1.723809523809524, | |
| "grad_norm": 5.5800010726124025, | |
| "learning_rate": 4.299246565604755e-07, | |
| "loss": 1.2218, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 91, | |
| "tokens_per_second_per_gpu": 3499.8 | |
| }, | |
| { | |
| "epoch": 1.7428571428571429, | |
| "grad_norm": 6.765368030458938, | |
| "learning_rate": 3.716363450626372e-07, | |
| "loss": 1.2117, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 92, | |
| "tokens_per_second_per_gpu": 3468.37 | |
| }, | |
| { | |
| "epoch": 1.7619047619047619, | |
| "grad_norm": 7.504548685452772, | |
| "learning_rate": 3.174004532942844e-07, | |
| "loss": 1.2299, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 93, | |
| "tokens_per_second_per_gpu": 3700.98 | |
| }, | |
| { | |
| "epoch": 1.7809523809523808, | |
| "grad_norm": 8.649122371866438, | |
| "learning_rate": 2.672775558335898e-07, | |
| "loss": 1.2265, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 94, | |
| "tokens_per_second_per_gpu": 3700.55 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 11.91832294221251, | |
| "learning_rate": 2.2132363356832528e-07, | |
| "loss": 1.2185, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 95, | |
| "tokens_per_second_per_gpu": 3680.96 | |
| }, | |
| { | |
| "epoch": 1.819047619047619, | |
| "grad_norm": 9.186156818821193, | |
| "learning_rate": 1.795900111723503e-07, | |
| "loss": 1.2008, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 96, | |
| "tokens_per_second_per_gpu": 3658.84 | |
| }, | |
| { | |
| "epoch": 1.8380952380952382, | |
| "grad_norm": 13.72977541399496, | |
| "learning_rate": 1.4212329978249415e-07, | |
| "loss": 1.2104, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.99, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 97, | |
| "tokens_per_second_per_gpu": 3581.27 | |
| }, | |
| { | |
| "epoch": 1.8571428571428572, | |
| "grad_norm": 6.290457692715211, | |
| "learning_rate": 1.0896534493985177e-07, | |
| "loss": 1.223, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 98, | |
| "tokens_per_second_per_gpu": 3791.37 | |
| }, | |
| { | |
| "epoch": 1.8761904761904762, | |
| "grad_norm": 9.702798624165407, | |
| "learning_rate": 8.0153179853653e-08, | |
| "loss": 1.2285, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 99, | |
| "tokens_per_second_per_gpu": 3639.09 | |
| }, | |
| { | |
| "epoch": 1.8952380952380952, | |
| "grad_norm": 11.005975725667684, | |
| "learning_rate": 5.571898403988573e-08, | |
| "loss": 1.2151, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 100, | |
| "tokens_per_second_per_gpu": 3757.98 | |
| }, | |
| { | |
| "epoch": 1.9142857142857141, | |
| "grad_norm": 8.44842365055977, | |
| "learning_rate": 3.569004738087988e-08, | |
| "loss": 1.2238, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 101, | |
| "tokens_per_second_per_gpu": 3661.88 | |
| }, | |
| { | |
| "epoch": 1.9333333333333333, | |
| "grad_norm": 4.816542675360639, | |
| "learning_rate": 2.0088739645983455e-08, | |
| "loss": 1.2232, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 102, | |
| "tokens_per_second_per_gpu": 3730.52 | |
| }, | |
| { | |
| "epoch": 1.9523809523809523, | |
| "grad_norm": 11.749396247795026, | |
| "learning_rate": 8.932485507387344e-09, | |
| "loss": 1.2118, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 103, | |
| "tokens_per_second_per_gpu": 3871.77 | |
| }, | |
| { | |
| "epoch": 1.9714285714285715, | |
| "grad_norm": 7.9532371124526104, | |
| "learning_rate": 2.2337450789815526e-09, | |
| "loss": 1.2109, | |
| "memory/device_reserved (GiB)": 127.42, | |
| "memory/max_active (GiB)": 124.18, | |
| "memory/max_allocated (GiB)": 122.82, | |
| "step": 104, | |
| "tokens_per_second_per_gpu": 3582.44 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 104, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 13, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1428859668922368.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |