diff --git a/src/grelu/data/preprocess.py b/src/grelu/data/preprocess.py index 5f1c925..bff9578 100644 --- a/src/grelu/data/preprocess.py +++ b/src/grelu/data/preprocess.py @@ -560,7 +560,7 @@ def get_gc_matched_intervals( from grelu.sequence.utils import get_unique_length genome_obj = get_genome(genome) - chroms = get_chromosomes(chroms) + chroms = get_chromosomes(chroms, genome=genome_obj) # Get seq_len seq_len = get_unique_length(intervals) @@ -767,7 +767,7 @@ def make_insertion_bigwig( filter_cmd = ( "" if chroms is None - else f"""grep {"".join([f"-e ^{chrom} " for chrom in get_chromosomes(chroms)])} | """ + else f"""grep {"".join([f"-e ^{chrom} " for chrom in get_chromosomes(chroms, genome=genome)])} | """ ) bedgraph_cmd = f"bedtools genomecov -bg -5 -i stdin -g {genome.sizes_file} | " sort_cmd = f"bedtools sort -i stdin > {bedgraph_file}" diff --git a/src/grelu/data/utils.py b/src/grelu/data/utils.py index 61828ae..a561392 100644 --- a/src/grelu/data/utils.py +++ b/src/grelu/data/utils.py @@ -2,6 +2,7 @@ `grelu.data.utils` contains Dataset-related utility functions. """ +import re from typing import List, Optional, Union import numpy as np @@ -33,36 +34,54 @@ def _create_task_data(task_names: List[str]) -> pd.DataFrame: return pd.DataFrame(index=task_names) -def get_chromosomes(chroms: Union[str, List[str]]) -> List[str]: +def get_chromosomes(chroms: Union[str, List[str]], genome=None) -> List[str]: """ Return a list of chromosomes given shortcut names. Args: - chroms: The chromosome name(s) or shortcut name(s). + chroms: The chromosome name(s) or shortcut name(s). Supported + shortcuts are "autosomes", "autosomesX", and "autosomesXY". + genome: A genome object with a ``sizes_file`` attribute (e.g. + from ``get_genome()``). When provided, chromosome names are + read from the genome instead of using hardcoded human defaults. Returns: A list of chromosome name(s). - - Example: - >>> get_chromosomes("autosomes") - ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', - 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', - 'chr20', 'chr21', 'chr22'] """ - # Define shortcuts for chromosome names - chrom_shortcuts = { - "autosomes": ["chr" + str(x) for x in range(1, 23)], - "autosomesX": ["chr" + str(x) for x in range(1, 23)] + ["chrX"], - "autosomesXY": ["chr" + str(x) for x in range(1, 23)] + ["chrX", "chrY"], - } - - # Return the corresponding chromosome names if a shortcut name is given - if isinstance(chroms, str) and (chroms in chrom_shortcuts): - return chrom_shortcuts[chroms] - - # Return the chromosome names if they are given directly - else: - return chroms + shortcuts = {"autosomes", "autosomesX", "autosomesXY"} + + if isinstance(chroms, str) and chroms in shortcuts: + if genome is not None: + # Read actual chromosome names from the genome + sizes = pd.read_table( + genome.sizes_file, + header=None, + names=["chrom", "size"], + dtype={"chrom": str, "size": int}, + ) + all_chroms = set(sizes["chrom"]) + autosomes = sorted( + [c for c in all_chroms if re.match(r"^chr\d+$", c)], + key=lambda c: int(c[3:]), + ) + if chroms == "autosomes": + return autosomes + elif chroms == "autosomesX": + return autosomes + (["chrX"] if "chrX" in all_chroms else []) + else: # autosomesXY + extras = [c for c in ["chrX", "chrY"] if c in all_chroms] + return autosomes + extras + else: + # Fall back to hardcoded human chromosomes + human_autosomes = ["chr" + str(x) for x in range(1, 23)] + if chroms == "autosomes": + return human_autosomes + elif chroms == "autosomesX": + return human_autosomes + ["chrX"] + else: # autosomesXY + return human_autosomes + ["chrX", "chrY"] + + return chroms def _tile_positions( diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py index 29ad344..ea6e457 100644 --- a/tests/test_data_utils.py +++ b/tests/test_data_utils.py @@ -1,3 +1,6 @@ +import os +import tempfile + import numpy as np import pandas as pd import pytest @@ -5,23 +8,60 @@ from grelu.data.utils import _check_multiclass, _create_task_data, get_chromosomes +class _FakeGenome: + """Minimal genome object with a sizes_file for testing.""" + + def __init__(self, chroms): + self._tmpfile = tempfile.NamedTemporaryFile( + mode="w", suffix=".sizes", delete=False + ) + for chrom in chroms: + self._tmpfile.write(f"{chrom}\t1000000\n") + self._tmpfile.flush() + + @property + def sizes_file(self): + return self._tmpfile.name + + def cleanup(self): + os.unlink(self._tmpfile.name) + + def test_get_chromosomes(): # Test direct input assert get_chromosomes(["chr1", "chr2"]) == ["chr1", "chr2"] - # Test shortcut input + # Test shortcut input (no genome = human defaults) assert get_chromosomes("autosomes") == [f"chr{i}" for i in range(1, 23)] - # Test shortcut input assert get_chromosomes("autosomesX") == [f"chr{i}" for i in range(1, 23)] + ["chrX"] - # Test shortcut input assert get_chromosomes("autosomesXY") == [f"chr{i}" for i in range(1, 23)] + [ "chrX", "chrY", ] +def test_get_chromosomes_with_genome(): + # Mouse genome (chr1-chr19 + chrX + chrY) + mouse_chroms = [f"chr{i}" for i in range(1, 20)] + ["chrX", "chrY", "chrM"] + genome = _FakeGenome(mouse_chroms) + try: + assert get_chromosomes("autosomes", genome=genome) == [ + f"chr{i}" for i in range(1, 20) + ] + assert get_chromosomes("autosomesX", genome=genome) == [ + f"chr{i}" for i in range(1, 20) + ] + ["chrX"] + assert get_chromosomes("autosomesXY", genome=genome) == [ + f"chr{i}" for i in range(1, 20) + ] + ["chrX", "chrY"] + # Direct input ignores genome + assert get_chromosomes(["chr1", "chr2"], genome=genome) == ["chr1", "chr2"] + finally: + genome.cleanup() + + def test_check_multiclass(): # Integer labels df = pd.DataFrame({"labels": [1, 2]})