forked from SIMBAL2/SIMBAL_SOFTWARE_SUITE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdomain_tool.pl
125 lines (102 loc) · 3.31 KB
/
domain_tool.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
use FileHandle;
use Getopt::Long;
use List::Util qw[min max];
#use warnings;
my $junk = "extraction.tmp";
my $usage = "
This program requires the following flags:
-seqs <file path to the FASTA file>
-hmm <file path to the HMM>
-domain <domain score cutoff>
-protein <protein score cutoff>
-full <print full sequences (default is no)>
-extend <amount to extend domain match regions by>
Optional flag:
-tmp <override the default location for the HMM results>
";
my %domains = ();
GetOptions ( 'seqs=s' => \$seqs,
'hmm=s' => \$hmm,
'domain=s' => \$domain_cutoff,
'protein=s' => \$protein_cutoff,
'tmp=s' => \$junk,
'extend=i' => \$extend,
'full' => \$full);
if (!$hmm || !$domain_cutoff || !$protein_cutoff) {die "$usage\n";}
if (!$seqs) {die "$usage\n";}
`hmmsearch -T $protein_cutoff --domtblout $junk $hmm $seqs`;
#`hmm3search -T $protein_cutoff --domtblout $junk $hmm $seqs`; <- JCVI alias
read_domtblout($junk);
search_fasta($seqs);
#########################################################################################################################################################################
sub read_domtblout
{
my $source = shift;
open (TABLE, "$source") || die "cannot read domtblout file\n";
while ($line = <TABLE>)
{
if (!($line =~ /^#/)) # ignore the header line
{
@split_line = split(/\s+/, $line); # splits on tabs - some entries contain whitespace
if ($split_line[13] >= $domain_cutoff)
{
push(@{$domains{$split_line[0]}[0]}, $split_line[19]); # adds "env from" coordinate to array
push(@{$domains{$split_line[0]}[1]}, $split_line[20]); # adds "env to" coordinate to array
# %domains is a hash, but $domains{identifier}[0] and $domains{$identifier}[1] are both arrays
# this way, all domains from one sequence are stored with the same hash key, but can easily be processed iteratively
}
}
}
close TABLE;
}
##########################################################################################################################################################################
sub search_fasta
{
my $fasta = shift;
open (FASTA, "$fasta") || die "cannot read sequence file\n";
my $tmp_seq = '';
while ($line = <FASTA>)
{
if ($line =~ /^>(.*?)\s/)
{
if ($domains{$1})
{
$identifier = $1;
$tmp_seq = '';
if ($line =~ /(\S*)\s/)
{
$header = $1;
}
else
{
$header = $line;
}
while ($seq_line = <FASTA>)
{
if (!($seq_line =~ />/))
{
chomp($seq_line);
$tmp_seq = ($tmp_seq . $seq_line)
}
else {last;}
}
for ($i = 0; $i <= $#{$domains{$identifier}[0]}; $i++)
{
$from = $domains{$identifier}[0][$i];
$from = max(0, $from-$extend);
$to = $domains{$identifier}[1][$i];
$to = min($to+$extend, length($tmp_seq));
$length = ($to - $from + 1);
# $tmp_seq =~ /.{$from}(.{$length})/;
$sub_seq = substr $tmp_seq, $from, $length;
if ($tmp_seq && $full) {chomp($header); print("$header\n$tmp_seq\n\n")}
elsif ($sub_seq && $length) {chomp($header); print("$header\/$from\-$to\n$sub_seq\n\n");}
}
}
else
{
next;
}
}
}
}