diff --git a/convert_to_hf.py b/convert_to_hf.py new file mode 100644 index 00000000..e6eb5df7 --- /dev/null +++ b/convert_to_hf.py @@ -0,0 +1,162 @@ +import json +import re +from enum import Enum +from pathlib import Path +from typing import Annotated + +import torch +import typer +from composer.models import write_huggingface_pretrained_from_composer_checkpoint +from typer import Option +from safetensors.torch import save_file as safetensors_save_file + +app = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]}, pretty_exceptions_show_locals=False) + + +class TorchDtype(str, Enum): + float32 = "float32" + float16 = "float16" + bfloat16 = "bfloat16" + + +def update_config( + source_config: dict, + bos_token_id: int, + eos_token_id: int, + cls_token_id: int, + pad_token_id: int, + sep_token_id: int, + max_length: int, + torch_dtype: TorchDtype, +) -> dict: + target_config = { + # "_name_or_path": "ModernBERT-base", + "architectures": ["ModernBertForMaskedLM"], + "attention_bias": source_config["attn_out_bias"], + "attention_dropout": source_config["attention_probs_dropout_prob"], + "bos_token_id": bos_token_id, + "classifier_activation": source_config.get("head_class_act", source_config["hidden_act"]), + "classifier_bias": source_config["head_class_bias"], + "classifier_dropout": source_config["head_class_dropout"], + "classifier_pooling": "mean", + "cls_token_id": cls_token_id, + "decoder_bias": source_config["decoder_bias"], + "deterministic_flash_attn": source_config["deterministic_fa2"], + "embedding_dropout": source_config["embed_dropout_prob"], + "eos_token_id": eos_token_id, + "global_attn_every_n_layers": source_config["global_attn_every_n_layers"], + "global_rope_theta": source_config["rotary_emb_base"], + "gradient_checkpointing": source_config["gradient_checkpointing"], + "hidden_activation": source_config["hidden_act"], + "hidden_size": source_config["hidden_size"], + "initializer_cutoff_factor": source_config["init_cutoff_factor"], + 
"initializer_range": source_config["initializer_range"], + "intermediate_size": source_config["intermediate_size"], + "layer_norm_eps": source_config["norm_kwargs"]["eps"], + "local_attention": source_config["sliding_window"], + "local_rope_theta": source_config["local_attn_rotary_emb_base"] + if ( + source_config["local_attn_rotary_emb_base"] + and source_config["local_attn_rotary_emb_base"] != -1 + ) + else source_config["rotary_emb_base"], + "max_position_embeddings": max_length, # Override with first config value + "mlp_bias": source_config["mlp_in_bias"], + "mlp_dropout": source_config["mlp_dropout_prob"], + "model_type": "modernbert", + "norm_bias": source_config["norm_kwargs"]["bias"], + "norm_eps": source_config["norm_kwargs"]["eps"], + "num_attention_heads": source_config["num_attention_heads"], + "num_hidden_layers": source_config["num_hidden_layers"], + "pad_token_id": pad_token_id, + "position_embedding_type": source_config["position_embedding_type"], + "sep_token_id": sep_token_id, + "tie_word_embeddings": source_config.get("tie_word_embeddings", True), + "torch_dtype": torch_dtype.value, + "transformers_version": "4.48.0", + "vocab_size": source_config["vocab_size"], + } + return target_config + + +@app.command(help="Convert a ModernBERT Composer checkpoint to HuggingFace pretrained format.") +def main( + output_name: Annotated[str, Option(help="Name of the output model", show_default=False)], + output_dir: Annotated[Path, Option(help="Path to the output directory", show_default=False)], + input_checkpoint: Annotated[Path, Option(help="Path to the ModernBERT Composer checkpoint file", show_default=False)], + bos_token_id: Annotated[int, Option(help="ID of the BOS token. Defaults to the ModernBERT BOS token.")] = 50281, + eos_token_id: Annotated[int, Option(help="ID of the EOS token. Defaults to the ModernBERT EOS token.")] = 50282, + cls_token_id: Annotated[int, Option(help="ID of the CLS token. 
Defaults to the ModernBERT CLS token.")] = 50281, + sep_token_id: Annotated[int, Option(help="ID of the SEP token. Defaults to the ModernBERT SEP token.")] = 50282, + pad_token_id: Annotated[int, Option(help="ID of the PAD token. Defaults to the ModernBERT PAD token.")] = 50283, + mask_token_id: Annotated[int, Option(help="ID of the MASK token. Defaults to the ModernBERT MASK token.")] = 50284, + max_length: Annotated[int, Option(help="Maximum length of the input sequence. Defaults to the final ModernBERT sequence length.")] = 8192, + torch_dtype: Annotated[TorchDtype, Option(help="Torch dtype to use for the model.")] = TorchDtype.float32, + pytorch_bin: Annotated[bool, Option(help="Save weights as a pytorch_model.bin file.")] = True, + safetensors: Annotated[bool, Option(help="Save weights as a model.safetensors file.")] = True, + drop_tied_decoder_weights: Annotated[bool, Option(help="Don't save the weight-tied decoder weights.")] = True, +): # fmt: skip + """ + Convert a ModernBERT Composer checkpoint to HuggingFace pretrained format. + """ + target_path = f"{output_dir}/{output_name}" + write_huggingface_pretrained_from_composer_checkpoint(input_checkpoint, target_path) + + # Process pytorch_model.bin + state_dict_path = f"{target_path}/pytorch_model.bin" + state_dict = torch.load(state_dict_path, map_location=torch.device("cpu")) + var_map = ( + (re.compile(r"encoder\.layers\.(.*)"), r"layers.\1"), + (re.compile(r"^bert\.(.*)"), r"model.\1"), # Replaces 'bert.' with 'model.' 
at the start of keys + ) + for pattern, replacement in var_map: + state_dict = {re.sub(pattern, replacement, name): tensor for name, tensor in state_dict.items()} + + # Update config.json + config_json_path = f"{target_path}/config.json" + with open(config_json_path, "r") as f: + config_dict = json.load(f) + config_dict = update_config( + config_dict, bos_token_id, eos_token_id, cls_token_id, pad_token_id, sep_token_id, max_length, torch_dtype + ) + with open(config_json_path, "w") as f: + json.dump(config_dict, f, indent=2) + + if config_dict.get("tie_word_embeddings", False) and drop_tied_decoder_weights: + if "decoder.weight" in state_dict: + del state_dict["decoder.weight"] + + # Export to pytorch_model.bin + if pytorch_bin: + torch.save(state_dict, state_dict_path) + + # Export to safetensors + if safetensors: + safetensors_path = f"{target_path}/model.safetensors" + safetensors_save_file(state_dict, safetensors_path) + + # Update tokenizer_config.json + tokenizer_config_path = f"{target_path}/tokenizer_config.json" + with open(tokenizer_config_path, "r") as f: + config_dict = json.load(f) + config_dict["model_max_length"] = max_length + config_dict["added_tokens_decoder"][str(mask_token_id)]["lstrip"] = True + config_dict["model_input_names"] = ["input_ids", "attention_mask"] + config_dict["tokenizer_class"] = "PreTrainedTokenizerFast" + + if "extra_special_tokens" in config_dict: + del config_dict["extra_special_tokens"] + with open(tokenizer_config_path, "w") as f: + json.dump(config_dict, f, indent=2) + + # Update special_tokens_map.json + special_tokens_path = f"{target_path}/special_tokens_map.json" + with open(special_tokens_path, "r") as f: + config_dict = json.load(f) + config_dict["mask_token"]["lstrip"] = True + with open(special_tokens_path, "w") as f: + json.dump(config_dict, f, indent=2) + + +if __name__ == "__main__": + app() diff --git a/yamls/modernbert/modernbert-base-context-extension.yaml 
b/yamls/modernbert/modernbert-base-context-extension.yaml new file mode 100644 index 00000000..9cf77d9a --- /dev/null +++ b/yamls/modernbert/modernbert-base-context-extension.yaml @@ -0,0 +1,159 @@ +data_local: data_folder +data_remote: # If blank, files must be present in data_local + +pretrain_data_local: pretrain_data_folder # set this to use pretraining data for validation metrics +pretrain_data_remote: # If blank, files must be present in pretrain_data_local + +max_seq_len: 8192 +tokenizer_name: answerdotai/ModernBERT-base +mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance +count_padding_tokens: false + +# Run Name +run_name: modernbert-base-context-extension +pretrain_run_name: modernbert-base-pretrain + +# Model +model: + name: flex_bert + pretrained_model_name: bert-base-uncased # has to be set to bert-base-uncased legacy reasons + tokenizer_name: ${tokenizer_name} + disable_train_metrics: true # save some time by not computing metrics on the training set + model_config: + vocab_size: 50368 + init_method: full_megatron + num_hidden_layers: 22 + hidden_size: 768 + intermediate_size: 1152 + num_attention_heads: 12 # to have head size of 64 + attention_layer: rope + attention_probs_dropout_prob: 0.0 + attn_out_bias: false + attn_out_dropout_prob: 0.1 + attn_qkv_bias: false + bert_layer: prenorm + embed_dropout_prob: 0.0 + embed_norm: true + final_norm: true + skip_first_prenorm: true + embedding_layer: sans_pos + loss_function: fa_cross_entropy + loss_kwargs: + reduction: mean + mlp_dropout_prob: 0.0 + mlp_in_bias: false + mlp_layer: glu + mlp_out_bias: false + normalization: layernorm + norm_kwargs: + eps: 1e-5 + bias: false + hidden_act: gelu + head_pred_act: gelu + activation_function: gelu # better safe than sorry + padding: unpadded + rotary_emb_dim: null + rotary_emb_base: 160000.0 + rotary_emb_scale_base: null + rotary_emb_interleaved: false + local_attn_rotary_emb_base: 10000.0 + local_attn_rotary_emb_dim: null + 
allow_embedding_resizing: true + sliding_window: 128 + global_attn_every_n_layers: 3 + unpad_embeddings: true + compile_model: true + masked_prediction: true + +# Dataloaders +train_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: true + mlm_probability: ${mlm_probability} + streaming: false + shuffle_seed: 2998 + drop_last: true + num_workers: 6 + sequence_packing: true + +eval_loader: + name: text + dataset: + local: ${pretrain_data_local} + remote: ${pretrain_data_remote} + split: validation + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: false + mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison + streaming: false + drop_last: false + num_workers: 3 + sequence_packing: false + + +# Optimization +scheduler: + name: constant_with_warmup + t_warmup: 0tok + t_max: ${max_duration} + +optimizer: + name: decoupled_stableadamw + lr: 3e-4 # Peak learning rate + betas: + - 0.9 + - 0.98 + eps: 1.0e-06 + weight_decay: 1.0e-5 # Amount of weight decay regularization + filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases + log_grad_norm: true + +max_duration: 250_000_000_000tok +eval_interval: 4000ba +global_train_batch_size: 576 +global_eval_batch_size: 1024 + +# System +seed: 17 +device_eval_batch_size: 128 +device_train_microbatch_size: 12 +precision: amp_bf16 + +# Logging +progress_bar: false +log_to_console: true +console_log_interval: 500ba + +callbacks: + speed_monitor: + window_size: 50 + lr_monitor: {} + scheduled_gc: {} + log_grad_norm: + batch_log_interval: 10 + packing_efficiency: + log_interval: 10 + +# W&B logging +# loggers: +# wandb: +# project: +# entity: + +save_interval: 4000ba +save_num_checkpoints_to_keep: -1 # Important, this cleans up checkpoints saved to DISK +save_folder: checkpoints/{run_name} + +# Load from local filesystem or remote 
object store to +load_path: checkpoints/{pretrain_run_name}/latest-rank0.pt + +autoresume: false +reset_time: true # restarts the scheduler, dataloaders, etc from step zero +restart_override: true # resets optimizer hyperparameters (LR, WD, etc), LR Scheduler, and training microbatch size from the checkpoint's values \ No newline at end of file diff --git a/yamls/modernbert/modernbert-base-learning-rate-decay.yaml b/yamls/modernbert/modernbert-base-learning-rate-decay.yaml new file mode 100644 index 00000000..492b15c4 --- /dev/null +++ b/yamls/modernbert/modernbert-base-learning-rate-decay.yaml @@ -0,0 +1,160 @@ +data_local: data_folder +data_remote: # If blank, files must be present in data_local + +pretrain_data_local: pretrain_data_folder # set this to use pretraining data for validation metrics +pretrain_data_remote: # If blank, files must be present in pretrain_data_local + +max_seq_len: 8192 +tokenizer_name: answerdotai/ModernBERT-base +mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance +count_padding_tokens: false + +# Run Name +run_name: modernbert-base-learning-rate-decay +context_extension_run_name: modernbert-base-context-extension + +# Model +model: + name: flex_bert + pretrained_model_name: bert-base-uncased # has to be set to bert-base-uncased legacy reasons + tokenizer_name: ${tokenizer_name} + disable_train_metrics: true # save some time by not computing metrics on the training set + model_config: + vocab_size: 50368 + init_method: full_megatron + num_hidden_layers: 22 + hidden_size: 768 + intermediate_size: 1152 + num_attention_heads: 12 # to have head size of 64 + attention_layer: rope + attention_probs_dropout_prob: 0.0 + attn_out_bias: false + attn_out_dropout_prob: 0.1 + attn_qkv_bias: false + bert_layer: prenorm + embed_dropout_prob: 0.0 + embed_norm: true + final_norm: true + skip_first_prenorm: true + embedding_layer: sans_pos + loss_function: fa_cross_entropy + loss_kwargs: + reduction: mean + mlp_dropout_prob: 
0.0 + mlp_in_bias: false + mlp_layer: glu + mlp_out_bias: false + normalization: layernorm + norm_kwargs: + eps: 1e-5 + bias: false + hidden_act: gelu + head_pred_act: gelu + activation_function: gelu # better safe than sorry + padding: unpadded + rotary_emb_dim: null + rotary_emb_base: 160000.0 + rotary_emb_scale_base: null + rotary_emb_interleaved: false + local_attn_rotary_emb_base: 10000.0 + local_attn_rotary_emb_dim: null + allow_embedding_resizing: true + sliding_window: 128 + global_attn_every_n_layers: 3 + unpad_embeddings: true + compile_model: true + masked_prediction: true + +# Dataloaders +train_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: true + mlm_probability: ${mlm_probability} + streaming: false + shuffle_seed: 2998 + drop_last: true + num_workers: 6 + sequence_packing: true + +eval_loader: + name: text + dataset: + local: ${pretrain_data_local} + remote: ${pretrain_data_remote} + split: validation + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: false + mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison + streaming: false + drop_last: false + num_workers: 3 + sequence_packing: false + + +# Optimization +scheduler: + name: one_minus_sqrt + alpha_f: 0.001 + t_decay: ${max_duration} + t_max: ${max_duration} + +optimizer: + name: decoupled_stableadamw + lr: 3e-4 # Peak learning rate + betas: + - 0.9 + - 0.98 + eps: 1.0e-06 + weight_decay: 1.0e-5 # Amount of weight decay regularization + filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases + log_grad_norm: true + +max_duration: 50_000_000_000tok +eval_interval: 1000ba +global_train_batch_size: 576 +global_eval_batch_size: 1024 + +# System +seed: 17 +device_eval_batch_size: 128 +device_train_microbatch_size: 12 +precision: amp_bf16 + +# Logging +progress_bar: false +log_to_console: true 
+console_log_interval: 500ba + +callbacks: + speed_monitor: + window_size: 50 + lr_monitor: {} + scheduled_gc: {} + log_grad_norm: + batch_log_interval: 10 + packing_efficiency: + log_interval: 10 + +# W&B logging +# loggers: +# wandb: +# project: +# entity: + +save_interval: 1000ba +save_num_checkpoints_to_keep: -1 # Important, this cleans up checkpoints saved to DISK +save_folder: checkpoints/{run_name} + +# Load from local filesystem or remote object store +load_path: checkpoints/{context_extension_run_name}/latest-rank0.pt + +autoresume: false +reset_time: true # restarts the scheduler, dataloaders, etc from step zero +restart_override: true # resets optimizer hyperparameters (LR, WD, etc), LR Scheduler, and training microbatch size from the checkpoint's values \ No newline at end of file diff --git a/yamls/modernbert/modernbert-base-pretrain.yaml b/yamls/modernbert/modernbert-base-pretrain.yaml new file mode 100644 index 00000000..9302c598 --- /dev/null +++ b/yamls/modernbert/modernbert-base-pretrain.yaml @@ -0,0 +1,153 @@ +data_local: data_folder +data_remote: # If blank, files must be present in data_local + +max_seq_len: 1024 +tokenizer_name: answerdotai/ModernBERT-base +mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance +count_padding_tokens: false + +# Run Name +run_name: modernbert-base-pretrain + +# Model +model: + name: flex_bert + pretrained_model_name: bert-base-uncased # has to be set to bert-base-uncased for legacy reasons + tokenizer_name: ${tokenizer_name} + disable_train_metrics: true # save some time by not computing metrics on the training set + model_config: + vocab_size: 50368 + init_method: full_megatron + num_hidden_layers: 22 + hidden_size: 768 + intermediate_size: 1152 + num_attention_heads: 12 # to have head size of 64 + attention_layer: rope + attention_probs_dropout_prob: 0.0 + attn_out_bias: false + attn_out_dropout_prob: 0.1 + attn_qkv_bias: false + bert_layer: prenorm + embed_dropout_prob: 0.0 + 
embed_norm: true + final_norm: true + skip_first_prenorm: true + embedding_layer: sans_pos + loss_function: fa_cross_entropy + loss_kwargs: + reduction: mean + mlp_dropout_prob: 0.0 + mlp_in_bias: false + mlp_layer: glu + mlp_out_bias: false + normalization: layernorm + norm_kwargs: + eps: 1e-5 + bias: false + hidden_act: gelu + head_pred_act: gelu + activation_function: gelu # better safe than sorry + padding: unpadded + rotary_emb_dim: null + rotary_emb_base: 10000.0 + rotary_emb_scale_base: null + rotary_emb_interleaved: false + allow_embedding_resizing: true + sliding_window: 128 + global_attn_every_n_layers: 3 + unpad_embeddings: true + compile_model: true + masked_prediction: true + +# Dataloaders +train_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: train + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: true + mlm_probability: ${mlm_probability} + streaming: false + drop_last: true + num_workers: 6 + sequence_packing: true + batch_size_warmup_min_size: ${device_train_microbatch_size} + batch_size_warmup_tokens: 50_000_000_000tok + + +eval_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: validation + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: false + mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison + streaming: false + drop_last: false + num_workers: 3 + sequence_packing: false + + +# Optimization +scheduler: + name: warmup_stable_decay + t_warmup: 3_000_000_000tok + alpha_f: 0.00 # Final LR multiplier for the decay phase; decay is effectively disabled here since t_decay is 0tok + t_decay: 0tok + +optimizer: + name: decoupled_stableadamw + lr: 8e-4 # Peak learning rate + betas: + - 0.9 + - 0.98 + eps: 1.0e-06 + weight_decay: 1.0e-5 # Amount of weight decay regularization + filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases + log_grad_norm: true + +max_duration: 
1_719_000_000_000tok +eval_interval: 4000ba +global_train_batch_size: 4608 +global_eval_batch_size: 1024 + +# System +seed: 17 +device_eval_batch_size: 128 +device_train_microbatch_size: 96 +precision: amp_bf16 + +# Logging +progress_bar: false +log_to_console: true +console_log_interval: 100ba + +callbacks: + speed_monitor: + window_size: 100 + lr_monitor: {} + scheduled_gc: {} + log_grad_norm: + batch_log_interval: 10 + packing_efficiency: + log_interval: 10 + +# W&B logging +# loggers: +# wandb: +# project: +# entity: + +# Checkpoint to local filesystem or remote object store +save_interval: 4000ba +save_num_checkpoints_to_keep: -1 # Important, this cleans up checkpoints saved to DISK +save_folder: checkpoints/{run_name} + +# Load from local filesystem or remote object store to +# load_path: null \ No newline at end of file diff --git a/yamls/modernbert/modernbert-large-context-extension.yaml b/yamls/modernbert/modernbert-large-context-extension.yaml new file mode 100644 index 00000000..66e15e0f --- /dev/null +++ b/yamls/modernbert/modernbert-large-context-extension.yaml @@ -0,0 +1,159 @@ +data_local: data_folder +data_remote: # If blank, files must be present in data_local + +pretrain_data_local: pretrain_data_folder # set this to use pretraining data for validation metrics +pretrain_data_remote: # If blank, files must be present in pretrain_data_local + +max_seq_len: 8192 +tokenizer_name: answerdotai/ModernBERT-large +mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance +count_padding_tokens: false + +# Run Name +run_name: modernbert-large-context-extension +pretrain_run_name: modernbert-large-pretrain + +# Model +model: + name: flex_bert + pretrained_model_name: bert-base-uncased # has to be set to bert-base-uncased legacy reasons + tokenizer_name: ${tokenizer_name} + disable_train_metrics: true # save some time by not computing metrics on the training set + model_config: + vocab_size: 50368 + init_method: full_megatron + 
num_hidden_layers: 28 + hidden_size: 1024 + intermediate_size: 2624 + num_attention_heads: 16 # to have head size of 64 + attention_layer: rope + attention_probs_dropout_prob: 0.0 + attn_out_bias: false + attn_out_dropout_prob: 0.1 + attn_qkv_bias: false + bert_layer: prenorm + embed_dropout_prob: 0.0 + embed_norm: true + final_norm: true + skip_first_prenorm: true + embedding_layer: sans_pos + loss_function: fa_cross_entropy + loss_kwargs: + reduction: mean + mlp_dropout_prob: 0.0 + mlp_in_bias: false + mlp_layer: glu + mlp_out_bias: false + normalization: layernorm + norm_kwargs: + eps: 1e-5 + bias: false + hidden_act: gelu + head_pred_act: gelu + activation_function: gelu # better safe than sorry + padding: unpadded + rotary_emb_dim: null + rotary_emb_base: 160000.0 + rotary_emb_scale_base: null + rotary_emb_interleaved: false + local_attn_rotary_emb_base: 10000.0 + local_attn_rotary_emb_dim: null + allow_embedding_resizing: true + sliding_window: 128 + global_attn_every_n_layers: 3 + unpad_embeddings: true + compile_model: true + masked_prediction: true + +# Dataloaders +train_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: true + mlm_probability: ${mlm_probability} + streaming: false + shuffle_seed: 2998 + drop_last: true + num_workers: 6 + sequence_packing: true + +eval_loader: + name: text + dataset: + local: ${pretrain_data_local} + remote: ${pretrain_data_remote} + split: validation + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: false + mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison + streaming: false + drop_last: false + num_workers: 3 + sequence_packing: false + + +# Optimization +scheduler: + name: constant_with_warmup + t_warmup: 0tok + t_max: ${max_duration} + +optimizer: + name: decoupled_stableadamw + lr: 5e-5 # Peak learning rate. 
Note that this is after the pretraining LR reduction + betas: + - 0.9 + - 0.98 + eps: 1.0e-06 + weight_decay: 1.0e-6 # Amount of weight decay regularization. Note that this is after the pretraining LR reduction + filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases + log_grad_norm: true + +max_duration: 250_000_000_000tok +eval_interval: 4000ba +global_train_batch_size: 616 +global_eval_batch_size: 616 + +# System +seed: 314 +device_eval_microbatch_size: 14 +device_train_microbatch_size: 7 +precision: amp_bf16 + +# Logging +progress_bar: false +log_to_console: true +console_log_interval: 500ba + +callbacks: + speed_monitor: + window_size: 50 + lr_monitor: {} + scheduled_gc: {} + log_grad_norm: + batch_log_interval: 10 + packing_efficiency: + log_interval: 10 + +# W&B logging +# loggers: +# wandb: +# project: +# entity: + +save_interval: 4000ba +save_num_checkpoints_to_keep: -1 # Important, this cleans up checkpoints saved to DISK +save_folder: checkpoints/{run_name} + +# Load from local filesystem or remote object store to +load_path: checkpoints/{pretrain_run_name}/latest-rank0.pt + +autoresume: false +reset_time: true # restarts the scheduler, dataloaders, etc from step zero +restart_override: true # resets optimizer hyperparameters (LR, WD, etc), LR Scheduler, and training microbatch size from the checkpoint's values \ No newline at end of file diff --git a/yamls/modernbert/modernbert-large-learning-rate-decay.yaml b/yamls/modernbert/modernbert-large-learning-rate-decay.yaml new file mode 100644 index 00000000..15bdcc4d --- /dev/null +++ b/yamls/modernbert/modernbert-large-learning-rate-decay.yaml @@ -0,0 +1,160 @@ +data_local: data_folder +data_remote: # If blank, files must be present in data_local + +pretrain_data_local: pretrain_data_folder # set this to use pretraining data for validation metrics +pretrain_data_remote: # If blank, files must be present in pretrain_data_local + +max_seq_len: 8192 +tokenizer_name: 
answerdotai/ModernBERT-large +mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance +count_padding_tokens: false + +# Run Name +run_name: modernbert-large-learning-rate-decay +context_extension_run_name: modernbert-large-context-extension + +# Model +model: + name: flex_bert + pretrained_model_name: bert-base-uncased # has to be set to bert-base-uncased legacy reasons + tokenizer_name: ${tokenizer_name} + disable_train_metrics: true # save some time by not computing metrics on the training set + model_config: + vocab_size: 50368 + init_method: full_megatron + num_hidden_layers: 28 + hidden_size: 1024 + intermediate_size: 2624 + num_attention_heads: 16 # to have head size of 64 + attention_layer: rope + attention_probs_dropout_prob: 0.0 + attn_out_bias: false + attn_out_dropout_prob: 0.1 + attn_qkv_bias: false + bert_layer: prenorm + embed_dropout_prob: 0.0 + embed_norm: true + final_norm: true + skip_first_prenorm: true + embedding_layer: sans_pos + loss_function: fa_cross_entropy + loss_kwargs: + reduction: mean + mlp_dropout_prob: 0.0 + mlp_in_bias: false + mlp_layer: glu + mlp_out_bias: false + normalization: layernorm + norm_kwargs: + eps: 1e-5 + bias: false + hidden_act: gelu + head_pred_act: gelu + activation_function: gelu # better safe than sorry + padding: unpadded + rotary_emb_dim: null + rotary_emb_base: 160000.0 + rotary_emb_scale_base: null + rotary_emb_interleaved: false + local_attn_rotary_emb_base: 10000.0 + local_attn_rotary_emb_dim: null + allow_embedding_resizing: true + sliding_window: 128 + global_attn_every_n_layers: 3 + unpad_embeddings: true + compile_model: true + masked_prediction: true + +# Dataloaders +train_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: true + mlm_probability: ${mlm_probability} + streaming: false + shuffle_seed: 2998 + drop_last: true + num_workers: 6 + sequence_packing: true + 
+eval_loader: + name: text + dataset: + local: ${pretrain_data_local} + remote: ${pretrain_data_remote} + split: validation + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: false + mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison + streaming: false + drop_last: false + num_workers: 3 + sequence_packing: false + + +# Optimization +scheduler: + name: one_minus_sqrt + alpha_f: 0.001 + t_decay: ${max_duration} + t_max: ${max_duration} + +optimizer: + name: decoupled_stableadamw + lr: 5e-5 # Peak learning rate. Note that this is after the pretraining LR reduction + betas: + - 0.9 + - 0.98 + eps: 1.0e-06 + weight_decay: 1.0e-6 # Amount of weight decay regularization. Note that this is after the pretraining LR reduction + filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases + log_grad_norm: true + +max_duration: 50_000_000_000tok +eval_interval: 1000ba +global_train_batch_size: 624 +global_eval_batch_size: 624 + +# System +seed: 314 +device_eval_microbatch_size: 6 +device_train_microbatch_size: 6 +precision: amp_bf16 + +# Logging +progress_bar: false +log_to_console: true +console_log_interval: 500ba + +callbacks: + speed_monitor: + window_size: 50 + lr_monitor: {} + scheduled_gc: {} + log_grad_norm: + batch_log_interval: 10 + packing_efficiency: + log_interval: 10 + +# W&B logging +# loggers: +# wandb: +# project: +# entity: + +save_interval: 1000ba +save_num_checkpoints_to_keep: -1 # Important, this cleans up checkpoints saved to DISK +save_folder: checkpoints/{run_name} + +# Load from local filesystem or remote object store to +load_path: checkpoints/{context_extension_run_name}/latest-rank0.pt + +autoresume: false +reset_time: true # restarts the scheduler, dataloaders, etc from step zero +restart_override: true # resets optimizer hyperparameters (LR, WD, etc), LR Scheduler, and training microbatch size from the checkpoint's values \ No newline at end of file diff --git 
a/yamls/modernbert/modernbert-large-pretrain.yaml b/yamls/modernbert/modernbert-large-pretrain.yaml new file mode 100644 index 00000000..b4d9e1a6 --- /dev/null +++ b/yamls/modernbert/modernbert-large-pretrain.yaml @@ -0,0 +1,160 @@ +data_local: data_folder +data_remote: # If blank, files must be present in data_local + +max_seq_len: 1024 +tokenizer_name: answerdotai/ModernBERT-large +mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance +count_padding_tokens: false + +# Run Name +run_name: modernbert-large-pretrain + +# Model +model: + name: flex_bert + pretrained_model_name: bert-base-uncased # has to be set to bert-base-uncased legacy reasons + tokenizer_name: ${tokenizer_name} + disable_train_metrics: true # save some time by not computing metrics on the training set + model_config: + vocab_size: 50368 + init_method: full_megatron + num_hidden_layers: 28 + hidden_size: 1024 + intermediate_size: 2624 + num_attention_heads: 16 # to have head size of 64 + attention_layer: rope + attention_probs_dropout_prob: 0.0 + attn_out_bias: false + attn_out_dropout_prob: 0.1 + attn_qkv_bias: false + bert_layer: prenorm + embed_dropout_prob: 0.0 + embed_norm: true + final_norm: true + skip_first_prenorm: true + embedding_layer: sans_pos + loss_function: fa_cross_entropy + loss_kwargs: + reduction: mean + mlp_dropout_prob: 0.0 + mlp_in_bias: false + mlp_layer: glu + mlp_out_bias: false + normalization: layernorm + norm_kwargs: + eps: 1e-5 + bias: false + hidden_act: gelu + head_pred_act: gelu + activation_function: gelu # better safe than sorry + padding: unpadded + rotary_emb_dim: null + rotary_emb_base: 10000.0 + rotary_emb_scale_base: null + rotary_emb_interleaved: false + allow_embedding_resizing: true + sliding_window: 128 + global_attn_every_n_layers: 3 + unpad_embeddings: true + compile_model: true + masked_prediction: true + +# Dataloaders +train_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: train + 
tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: true + mlm_probability: ${mlm_probability} + streaming: false + drop_last: true + num_workers: 6 + sequence_packing: true + batch_size_warmup_min_size: ${device_train_microbatch_size} + batch_size_warmup_tokens: 10_000_000_000tok + + +eval_loader: + name: text + dataset: + local: ${data_local} + remote: ${data_remote} + split: validation + tokenizer_name: ${tokenizer_name} + max_seq_len: ${max_seq_len} + shuffle: false + mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison + streaming: false + drop_last: false + num_workers: 3 + sequence_packing: false + + +# Optimization +scheduler: + name: warmup_stable_decay + t_warmup: 2_000_000_000tok + alpha_f: 0.00 + t_decay: 0tok + +optimizer: + name: decoupled_stableadamw + lr: 5e-4 # Peak learning rate. This learning rate was too high and required a restart. You'll want to reduce it. + betas: + - 0.9 + - 0.98 + eps: 1.0e-06 + weight_decay: 1.0e-5 # Amount of weight decay regularization + filter_bias_norm_wd: true # If True, doesn't apply weight decay to norm layers and biases + log_grad_norm: true + +max_duration: 1_719_000_000_000tok +eval_interval: 4000ba +global_train_batch_size: 4928 +global_eval_batch_size: 896 + +# Initialize ModernBERT-large from ModernBERT-base weights +init_from_checkpoint: + checkpoint_cfg: modernbert-base-pretrain.yaml + checkpoint_load_path: checkpoints # don't include a trailing slash + checkpoint_run_name: modernbert-base-pretrain + mode: tile_weights_from_middle + +# System +seed: 314 +device_eval_batch_size: 112 +device_train_microbatch_size: 56 +precision: amp_bf16 + +# Logging +progress_bar: false +log_to_console: true +console_log_interval: 100ba + +callbacks: + speed_monitor: + window_size: 100 + lr_monitor: {} + scheduled_gc: {} + log_grad_norm: + batch_log_interval: 10 + packing_efficiency: + log_interval: 10 + +# W&B logging +# loggers: +# wandb: +# project: +# entity: + +# 
Checkpoint to local filesystem or remote object store +save_interval: 4000ba +save_num_checkpoints_to_keep: -1 # Important, this cleans up checkpoints saved to DISK +save_folder: checkpoints/{run_name} + +# Load from local filesystem or remote object store to +# load_path: null \ No newline at end of file