AdEMAMix
AdEMAMix is a variant of the Adam optimizer.
bitsandbytes also supports paged optimizers, which take advantage of CUDA's unified memory to transfer memory from the GPU to the CPU when GPU memory is exhausted.
AdEMAMix
class bitsandbytes.optim.AdEMAMix
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, optim_bits: Literal = 32, min_8bit_size: int = 4096, is_paged: bool = False )
__init__
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, optim_bits: Literal = 32, min_8bit_size: int = 4096, is_paged: bool = False )
AdEMAMix8bit
class bitsandbytes.optim.AdEMAMix8bit
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, min_8bit_size: int = 4096, is_paged: bool = False )
__init__
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, min_8bit_size: int = 4096, is_paged: bool = False )
AdEMAMix32bit
class bitsandbytes.optim.AdEMAMix32bit
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, min_8bit_size: int = 4096, is_paged: bool = False )
__init__
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, min_8bit_size: int = 4096, is_paged: bool = False )
PagedAdEMAMix
class bitsandbytes.optim.PagedAdEMAMix
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, optim_bits: Literal = 32, min_8bit_size: int = 4096 )
__init__
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, optim_bits: Literal = 32, min_8bit_size: int = 4096 )
PagedAdEMAMix8bit
class bitsandbytes.optim.PagedAdEMAMix8bit
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, min_8bit_size: int = 4096 )
__init__
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, min_8bit_size: int = 4096 )
PagedAdEMAMix32bit
class bitsandbytes.optim.PagedAdEMAMix32bit
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, min_8bit_size: int = 4096 )
__init__
< source >( params: Iterable, lr: float = 0.001, betas: Tuple = (0.9, 0.999, 0.9999), alpha: float = 5.0, t_alpha: Optional = None, t_beta3: Optional = None, eps: float = 1e-08, weight_decay: float = 0.01, min_8bit_size: int = 4096 )