snakeMake/clusterRNAcode.pl at master · htafer/snakeMake · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/perl
# ===========================================================================
# clusterRNAcode.pl
# ===========================================================================
#
# Description:
#   Clusters and merges overlapping RNAcode predictions from sorted input.
#   Takes RNAcode output sorted by chromosome, start, and end positions,
#   and merges overlapping regions while tracking best scores and p-values.
#
# Usage:
#   cat rnaCode.xx* | sort -k 1,1 -k 4,4g -k 5,5g | grep Supercontig | perl clusterRNAcode.pl
#
# Input Format (tab-delimited):
#   chr source feature start end score|pvalue strand frame attributes
#
# Output Format (GTF):
#   chr RNAcode CDS start end score|pvalue strand . gene_id "GeneX"
#
# Dependencies:
#   Perl 5.10 or higher
#   Data::Dumper (for debugging)
#
# Author: htafer
# Last Updated: 2025-07-28
# ===========================================================================

use strict;
use warnings;
use feature 'say';
use Data::Dumper;

# Constants
use constant {
    MIN_SCORE => 1,
    MIN_PVALUE => 1,
    DEFAULT_STRAND => "/",
    EMPTY_CHR => "",
    INVALID_POS => -1,
};

# Initialize locus data structure with default values
my $locus = {
    prChr => EMPTY_CHR,
    prStart => INVALID_POS,
    prStop => INVALID_POS,
    prStrand => DEFAULT_STRAND,
    prScore => MIN_SCORE,
    prPValue => MIN_PVALUE,
    count => 0,
};
# Subroutine to print a locus in GTF format
sub print_locus {
    my ($locus) = @_;
    return if $locus->{prChr} eq EMPTY_CHR;  # Skip empty initial locus

    say join("\t",
        $locus->{prChr},
        "RNAcode",
        "CDS",
        $locus->{prStart},
        $locus->{prStop},
        $locus->{prScore} . "|" . $locus->{prPValue},
        $locus->{prStrand},
        ".",
        "gene_id \"Gene" . $locus->{count} . "\""
    );
}

# Process input line by line
while (my $line = <STDIN>) {
    chomp $line;

    # Parse input line
    my @data = split(/\t/, $line);
    die "Invalid input format: expected 9 fields, got " . scalar(@data)
        unless @data >= 7;

    # Extract score and p-value
    my ($pValue, $score) = split(/\|/, $data[5]);
    die "Invalid score format: $data[5]" unless defined $pValue && defined $score;

    # Check if current feature should start a new locus
    if ($data[0] ne $locus->{prChr} ||
        $data[3] > $locus->{prStop} ||
        $data[6] ne $locus->{prStrand}) {

        # Print previous locus
        print_locus($locus);
        $locus->{count}++;

        # Initialize new locus
        $locus->{prChr} = $data[0];
        $locus->{prStart} = int($data[3]);
        $locus->{prStop} = int($data[4]);
        $locus->{prStrand} = $data[6];
        $locus->{prScore} = $score;
        $locus->{prPValue} = $pValue;
    }
    else {
        # Merge with existing locus
        $locus->{prStart} = int($data[3]) if int($data[3]) < $locus->{prStart};
        $locus->{prStop} = int($data[4]) if int($data[4]) > $locus->{prStop};
        $locus->{prPValue} = $pValue if $pValue < $locus->{prPValue};
        $locus->{prScore} = $score if $score < $locus->{prScore};
    }
}

# Print final locus
print_locus($locus);