Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/grelu/data/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,7 @@ def get_gc_matched_intervals(
from grelu.sequence.utils import get_unique_length

genome_obj = get_genome(genome)
chroms = get_chromosomes(chroms)
chroms = get_chromosomes(chroms, genome=genome_obj)

# Get seq_len
seq_len = get_unique_length(intervals)
Expand Down Expand Up @@ -767,7 +767,7 @@ def make_insertion_bigwig(
filter_cmd = (
""
if chroms is None
else f"""grep {"".join([f"-e ^{chrom} " for chrom in get_chromosomes(chroms)])} | """
else f"""grep {"".join([f"-e ^{chrom} " for chrom in get_chromosomes(chroms, genome=genome)])} | """
)
bedgraph_cmd = f"bedtools genomecov -bg -5 -i stdin -g {genome.sizes_file} | "
sort_cmd = f"bedtools sort -i stdin > {bedgraph_file}"
Expand Down
63 changes: 41 additions & 22 deletions src/grelu/data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
`grelu.data.utils` contains Dataset-related utility functions.
"""

import re
from typing import List, Optional, Union

import numpy as np
Expand Down Expand Up @@ -33,36 +34,54 @@ def _create_task_data(task_names: List[str]) -> pd.DataFrame:
return pd.DataFrame(index=task_names)


def get_chromosomes(chroms: Union[str, List[str]]) -> List[str]:
def get_chromosomes(chroms: Union[str, List[str]], genome=None) -> List[str]:
"""
Return a list of chromosomes given shortcut names.

Args:
chroms: The chromosome name(s) or shortcut name(s).
chroms: The chromosome name(s) or shortcut name(s). Supported
shortcuts are "autosomes", "autosomesX", and "autosomesXY".
genome: A genome object with a ``sizes_file`` attribute (e.g.
from ``get_genome()``). When provided, chromosome names are
read from the genome instead of using hardcoded human defaults.

Returns:
A list of chromosome name(s).

Example:
>>> get_chromosomes("autosomes")
['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10',
'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
'chr20', 'chr21', 'chr22']
"""
# Define shortcuts for chromosome names
chrom_shortcuts = {
"autosomes": ["chr" + str(x) for x in range(1, 23)],
"autosomesX": ["chr" + str(x) for x in range(1, 23)] + ["chrX"],
"autosomesXY": ["chr" + str(x) for x in range(1, 23)] + ["chrX", "chrY"],
}

# Return the corresponding chromosome names if a shortcut name is given
if isinstance(chroms, str) and (chroms in chrom_shortcuts):
return chrom_shortcuts[chroms]

# Return the chromosome names if they are given directly
else:
return chroms
shortcuts = {"autosomes", "autosomesX", "autosomesXY"}

if isinstance(chroms, str) and chroms in shortcuts:
if genome is not None:
# Read actual chromosome names from the genome
sizes = pd.read_table(
genome.sizes_file,
header=None,
names=["chrom", "size"],
dtype={"chrom": str, "size": int},
)
all_chroms = set(sizes["chrom"])
autosomes = sorted(
[c for c in all_chroms if re.match(r"^chr\d+$", c)],
key=lambda c: int(c[3:]),
)
if chroms == "autosomes":
return autosomes
elif chroms == "autosomesX":
return autosomes + (["chrX"] if "chrX" in all_chroms else [])
else: # autosomesXY
extras = [c for c in ["chrX", "chrY"] if c in all_chroms]
return autosomes + extras
else:
# Fall back to hardcoded human chromosomes
human_autosomes = ["chr" + str(x) for x in range(1, 23)]
if chroms == "autosomes":
return human_autosomes
elif chroms == "autosomesX":
return human_autosomes + ["chrX"]
else: # autosomesXY
return human_autosomes + ["chrX", "chrY"]

return chroms


def _tile_positions(
Expand Down
46 changes: 43 additions & 3 deletions tests/test_data_utils.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,67 @@
import os
import tempfile

import numpy as np
import pandas as pd
import pytest

from grelu.data.utils import _check_multiclass, _create_task_data, get_chromosomes


class _FakeGenome:
"""Minimal genome object with a sizes_file for testing."""

def __init__(self, chroms):
self._tmpfile = tempfile.NamedTemporaryFile(
mode="w", suffix=".sizes", delete=False
)
for chrom in chroms:
self._tmpfile.write(f"{chrom}\t1000000\n")
self._tmpfile.flush()

@property
def sizes_file(self):
return self._tmpfile.name

def cleanup(self):
os.unlink(self._tmpfile.name)


def test_get_chromosomes():
# Test direct input
assert get_chromosomes(["chr1", "chr2"]) == ["chr1", "chr2"]

# Test shortcut input
# Test shortcut input (no genome = human defaults)
assert get_chromosomes("autosomes") == [f"chr{i}" for i in range(1, 23)]

# Test shortcut input
assert get_chromosomes("autosomesX") == [f"chr{i}" for i in range(1, 23)] + ["chrX"]

# Test shortcut input
assert get_chromosomes("autosomesXY") == [f"chr{i}" for i in range(1, 23)] + [
"chrX",
"chrY",
]


def test_get_chromosomes_with_genome():
# Mouse genome (chr1-chr19 + chrX + chrY)
mouse_chroms = [f"chr{i}" for i in range(1, 20)] + ["chrX", "chrY", "chrM"]
genome = _FakeGenome(mouse_chroms)
try:
assert get_chromosomes("autosomes", genome=genome) == [
f"chr{i}" for i in range(1, 20)
]
assert get_chromosomes("autosomesX", genome=genome) == [
f"chr{i}" for i in range(1, 20)
] + ["chrX"]
assert get_chromosomes("autosomesXY", genome=genome) == [
f"chr{i}" for i in range(1, 20)
] + ["chrX", "chrY"]
# Direct input ignores genome
assert get_chromosomes(["chr1", "chr2"], genome=genome) == ["chr1", "chr2"]
finally:
genome.cleanup()


def test_check_multiclass():
# Integer labels
df = pd.DataFrame({"labels": [1, 2]})
Expand Down
Loading