Commit a5a2af9

Evo2 spike-no-more support (#1011)
### Description

Spike-no-more support for training evo2 models.

### Type of changes

- [ ] Bug fix (non-breaking change which fixes an issue)
- [x] New feature (non-breaking change which adds functionality)
- [ ] Refactor
- [ ] Documentation update
- [ ] Other (please describe):

---------

Signed-off-by: John St John <jstjohn@nvidia.com>
1 parent 267eb51 · commit a5a2af9

3 files changed: 29 additions & 5 deletions
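
In short, the change threads three knobs through `train.py`: `--spike-no-more-embedding-init` (which now also unties embedding and output weights), `--no-weight-decay-embeddings` (now honored on the Hyena path as well as Mamba), and a new `--ffn-hidden-size` override. A minimal sketch of exercising them through the module's own `parse_args`/`train` entry points; the argument list is illustrative ("7b" and the FFN width are hypothetical values, and any required dataset/trainer arguments are omitted):

```python
from bionemo.evo2.run.train import parse_args, train

argv = [
    "--model-size", "7b",              # hypothetical key; must exist in HYENA_MODEL_OPTIONS
    "--spike-no-more-embedding-init",  # Normal(0, 1.0) embedding init; also unties embeddings/outputs
    "--no-weight-decay-embeddings",    # exclude embedding parameters from weight decay
    "--ffn-hidden-size", "11264",      # hypothetical FFN width override
    # ...plus whatever dataset/trainer arguments a real run requires...
]
args = parse_args(argv)
trainer = train(args)  # returns an nl.Trainer per the function signature
```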

sub-packages/bionemo-evo2/src/bionemo/evo2/models/mamba.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -335,6 +335,7 @@ class HybridMambaConfig8BEvo2Loss(NemotronHConfigBase):
     # to be close to a target value (1.0).
     use_targeted_variance_loss: bool = False
     targeted_variance_loss_loss_coeff: float = 0.1
+    share_embeddings_and_output_weights: bool = False
 
     def __post_init__(self):
         """Post-init logic for Evo2 to enable backwards compatibility with old configs."""
@@ -378,6 +379,7 @@ def configure_model(
             seq_len_interpolation_factor=self.seq_len_interpolation_factor,
             pre_process=pre_process or parallel_state.is_pipeline_first_stage(),
             post_process=post_process or parallel_state.is_pipeline_last_stage(),
+            share_embeddings_and_output_weights=self.share_embeddings_and_output_weights,
         )
```

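For context on why untying matters here: when the embeddings and the output projection share one tensor, the Normal(0, 1.0) spike-no-more init would also rescale the output logits. A minimal PyTorch sketch of tied vs. untied weights (illustrative, not the Megatron implementation):

```python
import torch.nn as nn

vocab, hidden = 512, 64

# Tied: the output head reuses the embedding matrix, so changing the
# embedding init would change the logit scale as well.
tied_emb = nn.Embedding(vocab, hidden)
tied_head = nn.Linear(hidden, vocab, bias=False)
tied_head.weight = tied_emb.weight  # one shared parameter

# Untied (share_embeddings_and_output_weights=False): embeddings can take
# the std-1.0 spike-no-more init while the head keeps a small init.
untied_emb = nn.Embedding(vocab, hidden)
nn.init.normal_(untied_emb.weight, mean=0.0, std=1.0)
untied_head = nn.Linear(hidden, vocab, bias=False)
nn.init.normal_(untied_head.weight, mean=0.0, std=0.02)
```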
sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py

Lines changed: 19 additions & 5 deletions
```diff
@@ -48,6 +48,7 @@
 
 from bionemo.evo2.models.mamba import MAMBA_MODEL_OPTIONS, MambaModel, mamba_no_weight_decay_cond_with_embeddings
 from bionemo.evo2.run.peft import Evo2LoRA
+from bionemo.evo2.utils.config import hyena_no_weight_decay_cond_with_embeddings
 from bionemo.evo2.utils.logging.callbacks import TEVCallback
 from bionemo.llm.utils.datamodule_utils import infer_global_batch_size
 from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger
@@ -176,7 +177,6 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
         help="TP communication backend to use. Defaults to 'nccl'.",
     )
     parser.add_argument("--align-param-gather", action="store_true", default=False)
-    # parser.add_argument("--straggler-detection", action="store_true", default=False)
     parser.add_argument(
         "--model-size",
         type=str,
@@ -356,7 +356,8 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
         help="If set, the embeddings are initialized with a Normal(0, 1.0) distribution rather "
         "than the default Normal(0, 0.02). This may help avoid loss spiking during training. Consider using this with "
         "--no-weight-decay-embeddings to avoid shrinking the embeddings to 0 by skipping weight decay on these layers, "
-        "or with --use-targeted-variance-loss to maintain a 1.0 variance during training even with weight decay.",
+        "or with --use-targeted-variance-loss to maintain a 1.0 variance during training even with weight decay. This "
+        "also turns off shared weights between embeddings and outputs.",
     )
     parser.add_argument(
         "--no-weight-decay-embeddings",
@@ -442,6 +443,12 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
         default=0.0,
         help="Dropout probability for the hyena layers",
     )
+    parser.add_argument(
+        "--ffn-hidden-size",
+        type=int,
+        default=None,
+        help="FFN hidden size for the hyena layers",
+    )
     parser.add_argument(
         "--log-num-zeros-in-grad",
         action="store_true",
@@ -549,7 +556,6 @@ def train(args: argparse.Namespace) -> nl.Trainer:
         tokenizer=tokenizer,
         eod_mask_loss=args.eod_pad_in_loss_mask,
     )
-
     if args.no_activation_checkpointing:
         activation_checkpointing_args = {
             "recompute_granularity": None,
@@ -583,6 +589,12 @@ def train(args: argparse.Namespace) -> nl.Trainer:
         "add_bias_output": args.add_bias_output,
         **activation_checkpointing_args,
     }
+    if args.spike_no_more_embedding_init:
+        config_modifiers_init["embedding_init_method_std"] = 1.0
+        # When using spike_no_more_embedding_init, we don't want to share embeddings and outputs.
+        config_modifiers_init["share_embeddings_and_output_weights"] = False
+    if args.ffn_hidden_size:
+        config_modifiers_init["ffn_hidden_size"] = args.ffn_hidden_size
     if args.use_targeted_variance_loss:
         config_modifiers_init["use_targeted_variance_loss"] = True
     if args.use_b2b_causal_conv1d:
@@ -603,6 +615,10 @@ def train(args: argparse.Namespace) -> nl.Trainer:
         if args.model_size not in HYENA_MODEL_OPTIONS:
             raise ValueError(f"Invalid model size for Hyena: {args.model_size}")
         model_config = HYENA_MODEL_OPTIONS[args.model_size](**config_modifiers_init)
+        if args.no_weight_decay_embeddings:
+            # Override the default weight decay condition for Hyena with our bionemo version that also excludes
+            # embeddings
+            model_config.hyena_no_weight_decay_cond_fn = hyena_no_weight_decay_cond_with_embeddings
         # Lora adaptors configuration
         lora_transform = None
         if args.lora_finetune:
@@ -612,8 +628,6 @@ def train(args: argparse.Namespace) -> nl.Trainer:
     else:  # mamba
         if args.no_weight_decay_embeddings:
             config_modifiers_init["hyena_no_weight_decay_cond_fn"] = mamba_no_weight_decay_cond_with_embeddings
-        if args.spike_no_more_embedding_init:  # --spike-no-more-embedding-init
-            config_modifiers_init["spike_no_more_embedding_init"] = True
         config_modifiers_init["lowercase_loss_reweighting"] = args.mamba_lowercase_loss_weight
         if args.model_size not in MAMBA_MODEL_OPTIONS:
             raise ValueError(f"Invalid model size for Mamba: {args.model_size}")
```

sub-packages/bionemo-evo2/src/bionemo/evo2/utils/config.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -20,9 +20,17 @@
 from pathlib import Path
 from typing import Literal
 
+from nemo.collections.llm.gpt.model.megatron.hyena.hyena_utils import hyena_no_weight_decay_cond
 from pydantic import BaseModel
 
 
+def hyena_no_weight_decay_cond_with_embeddings(name, param):
+    """Condition for no weight decay for Hyena parameters with embeddings."""
+    if "embedding" in name:
+        return True
+    return hyena_no_weight_decay_cond(name, param)
+
+
 class Evo2TaxonomyLineage(BaseModel):
     """Pydantic model class that defines the source lineage of a DNA sequence."""
```

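For reference, a condition function like this is typically consumed by splitting parameters into optimizer groups. The helper below is a sketch of that pattern (`make_param_groups` is a hypothetical name, not bionemo API); in the actual change the function is instead attached to the model config as `hyena_no_weight_decay_cond_fn`:

```python
import torch

from bionemo.evo2.utils.config import hyena_no_weight_decay_cond_with_embeddings


def make_param_groups(model: torch.nn.Module, weight_decay: float = 0.1):
    """Hypothetical helper: split params into decay / no-decay optimizer groups."""
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if hyena_no_weight_decay_cond_with_embeddings(name, param):
            no_decay.append(param)  # embeddings plus Hyena's default exclusions
        else:
            decay.append(param)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]
```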