# WARNING: YAML configs are currently an experimental feature
language_model:
  # model architecture
  num_layers: 24
  hidden_size: 1024
  num_attention_heads: 16
  num_query_groups: null

  ffn_hidden_size: null
  kv_channels: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  fp32_residual_connection: False

  apply_residual_connection_post_layernorm: False
  layernorm_epsilon: 1.e-5
  layernorm_zero_centered_gamma: True
  add_bias_linear: False
  bias_activation_fusion: False
  add_qkv_bias: False
  gated_linear_unit: False
  activation_func: swiglu
  num_moe_experts: null
  rotary_interleaved: False
  window_size: null

  # initialization
  init_method: null
  init_method_std: 0.02
  output_layer_init_method: null

  # mixed-precision
  apply_query_key_layer_scaling: False
  attention_softmax_in_fp32: False

  # fusion
  bias_swiglu_fusion: True
  masked_softmax_fusion: True
  persist_layer_norm: False
  memory_efficient_layer_norm: False
  bias_dropout_fusion: True
  apply_rope_fusion: True

  # activation recomputation
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  distribute_saved_activations: null
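
  # Illustrative example (added, not part of the original config): full,
  # uniformly distributed activation recomputation, assuming standard
  # Megatron-LM semantics for these keys ("full"/"selective" granularity,
  # "uniform"/"block" method, and layers per recompute chunk):
  # recompute_granularity: full
  # recompute_method: uniform
  # recompute_num_layers: 1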

  # fp8 related
  fp8: null
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1
  fp8_amax_compute_algo: "most_recent"
  fp8_wgrad: True
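
  # Illustrative example (added, not part of the original config): enable
  # FP8 with the hybrid recipe (E4M3 forward, E5M2 backward) and a longer
  # amax history; assumes Transformer Engine is installed and the keys
  # follow Megatron-LM's FP8 settings:
  # fp8: hybrid
  # fp8_amax_history_len: 1024
  # fp8_amax_compute_algo: "max"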

  # miscellaneous
  clone_scatter_output_in_embedding: True

  normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"

  # MoE related
  moe_router_load_balancing_type: "aux_loss"
  moe_router_topk: 2
  moe_grouped_gemm: False
  moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
  moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
  moe_input_jitter_eps: null
  moe_token_dropping: False
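
  # Illustrative example (added, not part of the original config): a top-2
  # mixture of 8 experts with a small load-balancing loss, assuming
  # Megatron-LM's MoE keys (num_moe_experts above must also be set for MoE
  # layers to be active at all):
  # num_moe_experts: 8
  # moe_router_topk: 2
  # moe_aux_loss_coeff: 1e-2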

model_parallel:
  # Model parallelism
  tensor_model_parallel_size: 1
  context_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  sequence_parallel: True
  expert_model_parallel_size: 1
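
  # Sizing note (added; assumes standard Megatron-LM semantics): world size
  # must equal tensor_model_parallel_size * pipeline_model_parallel_size *
  # data_parallel_size (times context/expert parallel sizes where used).
  # E.g., 16 GPUs with TP=2 and PP=2 leaves DP = 16 / (2 * 2) = 4.
  # sequence_parallel only has an effect when tensor_model_parallel_size > 1.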

  # Initialization
  perform_initialization: True
  use_cpu_initialization: null

  # Training
  fp16: False
  bf16: True
  params_dtype: null # Set from above arguments for core
  timers: null
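
  # Note (added): fp16 and bf16 are mutually exclusive; params_dtype is
  # derived from whichever one is set, so it can stay null here.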

  # Optimizations
  gradient_accumulation_fusion: True
  async_tensor_model_parallel_allreduce: True
  tp_comm_overlap: False

  # Debug Options
  tp_comm_split_ag: True
  tp_comm_atomic_ag: True
  tp_comm_split_rs: True
  tp_comm_atomic_rs: True
  tp_comm_bulk_wgrad: True
  tp_comm_bulk_dgrad: True

  # Parallelism
  finalize_model_grads_func: null
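
  # Note (added; assumes Megatron-LM/Transformer Engine behavior): the
  # tp_comm_* debug flags above only take effect when tp_comm_overlap is
  # True, i.e. when tensor-parallel all-gather/reduce-scatter traffic is
  # overlapped with GEMMs.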

  # Pipeline Parallel
  pipeline_dtype: null
  grad_scale_func: null
  enable_autocast: False
  autocast_dtype: null
  variable_seq_lengths: False
  num_microbatches_with_partial_activation_checkpoints: null
  overlap_p2p_comm: False
  batch_p2p_comm: True
  batch_p2p_sync: True
  use_ring_exchange_p2p: False
  deallocate_pipeline_outputs: False
  no_sync_func: null
  grad_sync_func: null
  param_sync_func: null
  pipeline_model_parallel_split_rank: null
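
  # Illustrative example (added, not part of the original config): an
  # interleaved 1F1B pipeline schedule, which in Megatron-LM pairs a
  # virtual pipeline size with overlapped, unbatched p2p communication
  # (semantics assumed from the corresponding CLI arguments):
  # pipeline_model_parallel_size: 4
  # virtual_pipeline_model_parallel_size: 2
  # overlap_p2p_comm: True
  # batch_p2p_comm: False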

  # CPU Offloading
  cpu_offloading: False
  cpu_offloading_num_layers: 0
  _cpu_offloading_context: null
  cpu_offloading_weights: False
  cpu_offloading_activations: True
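
  # Illustrative example (added, not part of the original config): offload
  # activations for the first 4 layers to host memory, assuming these keys
  # follow Megatron-LM's CPU offload settings (cpu_offloading_num_layers
  # must be smaller than num_layers):
  # cpu_offloading: True
  # cpu_offloading_num_layers: 4
  # cpu_offloading_activations: True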

  # Timing
  barrier_with_L1_time: True

# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
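
# Note (added; assumes Megatron-LM's --rampup-batch-size semantics):
# rampup_batch_size is [start, increment, ramp-up samples], so the global
# batch grows from 32 in steps of 32 up to global_batch_size (128) over the
# first 65,324,160 training samples. global_batch_size must be divisible by
# micro_batch_size * data_parallel_size.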

encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False

exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null

untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096

transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
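
# Note (added; assumes Megatron-LM semantics): transformer_impl selects
# between the "local" PyTorch implementation and "transformer_engine";
# features such as RMSNorm and FP8 require the latter. rotary_percent: 0.5
# applies rotary embeddings to half of each attention head's dimensions.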

# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
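
# Note (added): with these values the learning rate warms up linearly from
# lr_warmup_init (0.0) to lr (2.5e-4) over the first 81,381 samples, then
# follows a cosine decay to min_lr (2.5e-5) over 255,126,953 samples.
# Sample-based (lr_decay_samples) and iteration-based (lr_decay_iters)
# scheduling are alternatives; only one should be set.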

# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False

# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
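
# Note (added): the loss-scaling knobs only matter for fp16 training; with
# loss_scale null, dynamic scaling starts at 2^32 (4294967296), shrinks on
# overflow, and grows back after loss_scale_window overflow-free steps.
# They are inert under bf16.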

# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
delay_grad_reduce: True
overlap_param_gather: False
delay_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
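
# Note (added; assumes Megatron-LM semantics): use_distributed_optimizer
# shards optimizer state across data-parallel ranks (ZeRO-1 style);
# overlap_grad_reduce and overlap_param_gather then overlap its
# reduce-scatter/all-gather traffic with the backward and forward passes.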

train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False

adlr_autoresume: False
adlr_autoresume_interval: 1000

# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True

tp_comm_overlap_cfg: null

# data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
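
# Note (added; assumes Megatron-LM data loading): data_path takes the
# prefix of a preprocessed .bin/.idx dataset, and split: '99,1,0' weights
# it 99% train / 1% validation / 0% test. Since train_iters is null above,
# training length is driven by train_samples instead.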

# profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10

# logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_batch_size_to_tensorboard: False
log_learning_rate_to_tensorboard: True
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null