forked from liulab-dfci/TRUST4
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BuildImgtAnnot.pl
75 lines (68 loc) · 1.65 KB
/
BuildImgtAnnot.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env perl
use strict ;
use warnings ;
die "usage: a.pl species_name(Homo_sapiens or others) > output.fa\n" if ( @ARGV == 0 ) ;
sub system_call
{
print STDERR "SYSTEM CALL: ".join(" ",@_)."\n";
system(@_) == 0
or die "system @_ failed: $?";
#print STDERR " finished\n";
}
my $species = $ARGV[0] ;
# Download
#system_call( "wget -np -nd -r -A fasta -P tmp_download http://www.imgt.org//download/V-QUEST/IMGT_V-QUEST_reference_directory/".$species."/" ) ;
#system_call( "cat tmp_download/*.fasta > tmp_download/vquest.fa") ;
system_call( "wget -O IMGT_download.fa http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithGaps-F+ORF+inframeP" ) ;
# Reformat
#>J00256|IGHJ1*01|Homo sapiens|F|J-REGION|723..774|52 nt|1| | | | |52+0=52| | |
#
open FP1, "IMGT_download.fa" ;
my $prevId = "" ;
my $prevGeneName = "" ;
my $output = 0 ;
my $skipHeader = 0 ; # For the genes that are split into pieces in the database.
while ( <FP1> )
{
if ( /^>/ )
{
my @cols = split /\|/, substr( $_, 1 ) ;
my $s = $cols[2] ;
$s =~ tr/ /_/ ;
if ( !($s =~ /$species/) )
{
$output = 0 ;
}
elsif ( !( $cols[1] =~ /^IG/ ) && !($cols[1] =~ /^TR/) )
{
$output = 0 ;
}
else
{
$output = 1 ;
if ( $cols[1] eq $prevGeneName )
{
$output = 0 if ( $cols[0] ne $prevId ) ; # Could be some middle part of a gene.
$skipHeader = 1 ;
}
else
{
$skipHeader = 0 ;
}
}
$prevId = $cols[0] ;
$prevGeneName = $cols[1] ;
}
next if ( $output == 0) ;
if ( !/^>/ )
{
$_ =~ tr/acgtn/ACGTN/ ;
print $_ ;
}
elsif ( $skipHeader == 0 )
{
print ">".(split /\|/ )[1]."\n" ;
}
}
close FP1 ;
unlink "IMGT_download.fa" ;