#!/usr/bin/perl # Perl script: ESR4.pl # Authors: Xiang Shao & Alexei Fedorov # Bioinformatics Lab, Department of Medicine # Medical College of Ohio, Toledo, Ohio, USA # April, 2005 All rights reserved # Permission should be obtained prior to using this program # Contact afedorov@mco.edu # This Perl script is the 4th in the SCRAMBLING package to screen exon # repetition and exon scrambling. # It compares the candidate EST sequences with exon sequences of the # corresponding gene to generate a series of identity scores showing # the proportion of each exon that was present in the EST clone. # These scores are used in the next step of the pipeline. $prefix = $ARGV[0]; open (OUTPUT, ">RsltScores$prefix") || die "Can't open RsltScores$prefix : $!\n"; open (OUTPUTSUM, ">RsltSumScores$prefix") || die "Can't open RsltSumScores$prefix : $!\n"; @exons1 = (); $c = 0; $ct = 1; %HoA = (); open (EXON, "human_exons") || die "Can't open human_exons : $!\n"; $/ = ">"; while () { $c++; chomp; @lines = split ("\n", $_); $info = $lines[0]; $info =~ s/>//g; $info =~ s/\n//g; $info =~ /_NT_/; $id = $`; $id =~ s/\s//; #print "$id\t"; $seq = $lines[1]; $seq =~ s/>//g; $seq =~ s/\n//g; if ($c == 1) { $latestId = $id; push @exons1, $info; push @exons1, $seq; } else { if ($latestId eq $id) { push @{"exons".$ct}, $info; push @{"exons".$ct}, $seq; } else { $HoA{$latestId} = \@{"exons".$ct}; $ct++; @{"exons".$ct} = (); $latestId = $id; push @{"exons".$ct}, $info; push @{"exons".$ct}, $seq; } } # print "$c\n"; } close(EXON); @introns1 = (); $ci = 0; $cti = 1; %HoAi = (); open (INTRON, "human_introns") || die "Can't open human_introns : $!\n"; while () { $ci++; chomp; @linesi = split ("\n", $_); $infoi = $linesi[0]; $infoi =~ s/>//g; $infoi =~ s/\n//g; $infoi =~ /_NT_/; $idi = $`; $idi =~ s/\s//; #print "$idi\t"; $seqi = $linesi[1]; $seqi =~ s/>//g; #$seqi =~ s/\n//g; if ($ci == 1) { $latestIdi = $idi; push @introns1, $infoi; push @introns1, $seqi; } else { if ($latestIdi eq $idi) { push @{"introns".$cti}, $infoi; push @{"introns".$cti}, $seqi; } else { $HoAi{$latestIdi} = \@{"introns".$cti}; $cti++; @{"introns".$cti} = (); $latestIdi = $idi; push @{"introns".$cti}, $infoi; push @{"introns".$cti}, $seqi; } } # print "$ci\n"; } close(INTRON); # foreach $i (keys %HoA) { # $arrayRef = $HoA{$i}; # @es = @$arrayRef; # foreach $e (@es) { # print OUTPUT ">$i\n$e\n"; # } # } open (INP, "$prefix") || die "Can't open $prefix : $!\n"; $ctTotal = -1; $ctSkipped = 0; while () { $ctTotal++; print "$ctTotal\n" if ($ctTotal % 100 == 0); if ($ctTotal == 0) {next;} @lines = split ("\n", $_); $ESTindex = $lines[0]; $CDSindex = $lines[1]; $numOfMatch = $lines[2]; $estInfo = $lines[3]; $estSeq = $lines[4]; if ($estSeq =~ m/gtgtgtgtgtgtgtgtgtgt/) { $ctSkipped++; next;} $cdsInfo = $lines[5]; $cdsInfo =~ /_NT_/; $cdsid = $`; $cdsid =~ s/\s//; $cdsSeq = $lines[6]; undef @exons; undef @introns; if(exists $HoA{$cdsid}) { $addr = $HoA{$cdsid}; # print "$ctTotal\n"; @exons = @$addr; for ($m = 1; $m <= $#exons; $m += 2) { undef %{"exon".$m}; %{"exon".$m} = (); ${"length".$m} = length($exons[$m]); if (${"length".$m} < 12) { next; } ${"max".$m} = ${"length".$m} - 12 + 1; #print ${"max".$m}; print "\t"; ${"ctExon".$m} = 0; ${"length".$m} -=12; for $y (0..${"length".$m}) { $seq12exon = lc(substr($exons[$m], $y, 12)); ${"exon".$m}{$seq12exon} = $y; } } $length = length($estSeq); $length -=12; for $y (0..$length) { $seq12EST = substr($estSeq, $y, 12); for ($m = 1; $m <= $#exons; $m += 2) { if (exists(${"exon".$m}{$seq12EST})) { ${"ctExon".$m}++; } } } $num = ($#exons + 1)/2; print OUTPUT ">$ESTindex\n$CDSindex\n$numOfMatch\n$estInfo\n$estSeq\n$cdsInfo\n$cdsSeq\n$num\n"; for ($m = 1; $m <= $#exons; $m += 2) { print OUTPUT "$exons[$m-1]\n$exons[$m]\n"; } for ($m = 1; $m <= $#exons; $m += 2) { $n = ($m-1)/2 + 1; if (${"length".$m} < 12) { $perc = -99999; } else { if (${"max".$m} == 0) { $perc = 0; } else { $perc = ${"ctExon".$m}/${"max".$m}; } } # if ($perc < 0.005) { $perc = 0; } $prtPerc = sprintf ("%.2f", $perc); print OUTPUT "$n:$prtPerc;"; } print OUTPUT "\n"; } if(exists $HoAi{$cdsid}) { $addr = $HoAi{$cdsid}; # print "$ctTotal\n"; @introns = @$addr; for ($m = 1; $m <= $#introns; $m += 2) { undef %{"intron".$m}; %{"intron".$m} = (); ${"length".$m} = length($introns[$m]); if (${"length".$m} < 12) { next; } ${"max".$m} = ${"length".$m} - 12 + 1; #print ${"max".$m}; print "\t"; ${"ctIntron".$m} = 0; ${"length".$m} -=12; for $y (0..${"length".$m}) { $seq12intron = lc(substr($introns[$m], $y, 12)); ${"intron".$m}{$seq12intron} = $y; } } $length = length($estSeq); $length -=12; for $y (0..$length) { $seq12EST = substr($estSeq, $y, 12); for ($m = 1; $m <= $#exons; $m += 2) { if (exists(${"intron".$m}{$seq12EST})) { ${"ctIntron".$m}++; } } } $num = ($#introns + 1)/2; print OUTPUT "$num\n"; for ($m = 1; $m <= $#introns; $m += 2) { print OUTPUT "$introns[$m-1]\n$introns[$m]\n"; } for ($m = 1; $m <= $#introns; $m += 2) { $n = ($m-1)/2 + 1; if (${"length".$m} < 12) { $perc = -99999; } else { $perc = ${"ctIntron".$m} } print OUTPUT "$n:$perc;"; } print OUTPUT "\n"; } } close(INP); print OUTPUTSUM "\nctTotal\tctSkipped\n$ctTotal\t$ctSkipped\n";