Genentech · avantikalal · Mar 25, 2026 · Mar 19, 2026 · Mar 25, 2026
diff --git a/src/grelu/data/preprocess.py b/src/grelu/data/preprocess.py
@@ -560,7 +560,7 @@ def get_gc_matched_intervals(
     from grelu.sequence.utils import get_unique_length
 
     genome_obj = get_genome(genome)
-    chroms = get_chromosomes(chroms)
+    chroms = get_chromosomes(chroms, genome=genome_obj)
 
     # Get seq_len
     seq_len = get_unique_length(intervals)
@@ -767,7 +767,7 @@ def make_insertion_bigwig(
     filter_cmd = (
         ""
         if chroms is None
-        else f"""grep {"".join([f"-e ^{chrom} " for chrom in get_chromosomes(chroms)])} | """
+        else f"""grep {"".join([f"-e ^{chrom} " for chrom in get_chromosomes(chroms, genome=genome)])} | """
     )
     bedgraph_cmd = f"bedtools genomecov -bg -5 -i stdin -g {genome.sizes_file} | "
     sort_cmd = f"bedtools sort -i stdin > {bedgraph_file}"

diff --git a/src/grelu/data/utils.py b/src/grelu/data/utils.py
@@ -2,6 +2,7 @@
 `grelu.data.utils` contains Dataset-related utility functions.
 """
 
+import re
 from typing import List, Optional, Union
 
 import numpy as np
@@ -33,36 +34,54 @@ def _create_task_data(task_names: List[str]) -> pd.DataFrame:
     return pd.DataFrame(index=task_names)
 
 
-def get_chromosomes(chroms: Union[str, List[str]]) -> List[str]:
+def get_chromosomes(chroms: Union[str, List[str]], genome=None) -> List[str]:
     """
     Return a list of chromosomes given shortcut names.
 
     Args:
-        chroms: The chromosome name(s) or shortcut name(s).
+        chroms: The chromosome name(s) or shortcut name(s). Supported
+            shortcuts are "autosomes", "autosomesX", and "autosomesXY".
+        genome: A genome object with a ``sizes_file`` attribute (e.g.
+            from ``get_genome()``). When provided, chromosome names are
+            read from the genome instead of using hardcoded human defaults.
 
     Returns:
         A list of chromosome name(s).
-
-    Example:
-        >>> get_chromosomes("autosomes")
-        ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
-        'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
-        'chr20', 'chr21', 'chr22']
     """
-    # Define shortcuts for chromosome names
-    chrom_shortcuts = {
-        "autosomes": ["chr" + str(x) for x in range(1, 23)],
-        "autosomesX": ["chr" + str(x) for x in range(1, 23)] + ["chrX"],
-        "autosomesXY": ["chr" + str(x) for x in range(1, 23)] + ["chrX", "chrY"],
-    }
-
-    # Return the corresponding chromosome names if a shortcut name is given
-    if isinstance(chroms, str) and (chroms in chrom_shortcuts):
-        return chrom_shortcuts[chroms]
-
-    # Return the chromosome names if they are given directly
-    else:
-        return chroms
+    shortcuts = {"autosomes", "autosomesX", "autosomesXY"}
+
+    if isinstance(chroms, str) and chroms in shortcuts:
+        if genome is not None:
+            # Read actual chromosome names from the genome
+            sizes = pd.read_table(
+                genome.sizes_file,
+                header=None,
+                names=["chrom", "size"],
+                dtype={"chrom": str, "size": int},
+            )
+            all_chroms = set(sizes["chrom"])
+            autosomes = sorted(
+                [c for c in all_chroms if re.match(r"^chr\d+$", c)],
+                key=lambda c: int(c[3:]),
+            )
+            if chroms == "autosomes":
+                return autosomes
+            elif chroms == "autosomesX":
+                return autosomes + (["chrX"] if "chrX" in all_chroms else [])
+            else:  # autosomesXY
+                extras = [c for c in ["chrX", "chrY"] if c in all_chroms]
+                return autosomes + extras
+        else:
+            # Fall back to hardcoded human chromosomes
+            human_autosomes = ["chr" + str(x) for x in range(1, 23)]
+            if chroms == "autosomes":
+                return human_autosomes
+            elif chroms == "autosomesX":
+                return human_autosomes + ["chrX"]
+            else:  # autosomesXY
+                return human_autosomes + ["chrX", "chrY"]
+
+    return chroms
 
 
 def _tile_positions(

diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py
@@ -1,27 +1,67 @@
+import os
+import tempfile
+
 import numpy as np
 import pandas as pd
 import pytest
 
 from grelu.data.utils import _check_multiclass, _create_task_data, get_chromosomes
 
 
+class _FakeGenome:
+    """Minimal genome object with a sizes_file for testing."""
+
+    def __init__(self, chroms):
+        self._tmpfile = tempfile.NamedTemporaryFile(
+            mode="w", suffix=".sizes", delete=False
+        )
+        for chrom in chroms:
+            self._tmpfile.write(f"{chrom}\t1000000\n")
+        self._tmpfile.flush()
+
+    @property
+    def sizes_file(self):
+        return self._tmpfile.name
+
+    def cleanup(self):
+        os.unlink(self._tmpfile.name)
+
+
 def test_get_chromosomes():
     # Test direct input
     assert get_chromosomes(["chr1", "chr2"]) == ["chr1", "chr2"]
 
-    # Test shortcut input
+    # Test shortcut input (no genome = human defaults)
     assert get_chromosomes("autosomes") == [f"chr{i}" for i in range(1, 23)]
 
-    # Test shortcut input
     assert get_chromosomes("autosomesX") == [f"chr{i}" for i in range(1, 23)] + ["chrX"]
 
-    # Test shortcut input
     assert get_chromosomes("autosomesXY") == [f"chr{i}" for i in range(1, 23)] + [
         "chrX",
         "chrY",
     ]
 
 
+def test_get_chromosomes_with_genome():
+    # Mouse genome (chr1-chr19 + chrX + chrY)
+    mouse_chroms = [f"chr{i}" for i in range(1, 20)] + ["chrX", "chrY", "chrM"]
+    genome = _FakeGenome(mouse_chroms)
+    try:
+        assert get_chromosomes("autosomes", genome=genome) == [
+            f"chr{i}" for i in range(1, 20)
+        ]
+        assert get_chromosomes("autosomesX", genome=genome) == [
+            f"chr{i}" for i in range(1, 20)
+        ] + ["chrX"]
+        assert get_chromosomes("autosomesXY", genome=genome) == [
+            f"chr{i}" for i in range(1, 20)
+        ] + ["chrX", "chrY"]
+        # Direct input ignores genome
+        assert get_chromosomes(["chr1", "chr2"], genome=genome) == ["chr1", "chr2"]
+    finally:
+        genome.cleanup()
+
+
 def test_check_multiclass():
     # Integer labels
     df = pd.DataFrame({"labels": [1, 2]})