#!/usr/bin/perl # Perl script: ESR3.pl # Authors: Xiang Shao & Alexei Fedorov # Bioinformatics Lab, Department of Medicine # Medical College of Ohio, Toledo, Ohio, USA # April, 2005 All rights reserved # Permission should be obtained prior to using this program # Contact afedorov@mco.edu # This Perl script is the 3rd in the SCRAMBLING package to screen exon # repetition and exon scrambling. # It maps the beginning and end of each CDS onto the corresponding EST # sequence and calculated the length of the CDS fragment between these # two mapped positions. If the length of this mRNA fragment differed # from the length of EST sequence by over 40 nucleotides, that EST was # output as "candidates". open (OUTPUT1, ">ESTCandidates") || die "Can't open ESTCandidates : $!\n"; open (OUTPUT2, ">ESTOthers") || die "Can't open ESTOthers : $!\n"; open (OUTPUTLowMatch, ">ESTLowMatch") || die "Can't open ESTLowMatch : $!\n"; open (OUTPUTLong, ">ESTLong") || die "Can't open ESTLong : $!\n"; open (OUTSUM, ">ESTCadidatesSumary") || die "Can't open ESTCadidatesSumary : $!\n"; $inf = shift @ARGV; open (INP, "$inf") || die "Can't open $inf : $!\n"; $/ = ">"; $ctTotal = -1; $ctCandidates = $ctOthers = $ctLowMatch = $ctLongESTs = 0; while () { $ctTotal++; next if $ctTotal == 0; @lines = split ("\n", $_); $ESTindex = $lines[0]; print "processing $ctTotal\n" if ($ctTotal % 1000 == 0); $CDSindex = $lines[1]; $numOfMatch = $lines[2]; $estInfo = $lines[3]; $estSeq = $lines[4]; $cdsInfo = $lines[5]; $cdsSeq = $lines[6]; $lenEST = length($estSeq); $lenCDS = length($cdsSeq); if ($lenEST > $lenCDS) { print OUTPUTLong ">$ESTindex\n$CDSindex\n$numOfMatch\n$estInfo\n$estSeq\n$cdsInfo\n$cdsSeq\n"; $ctLongESTs++; next; } if ($numOfMatch < 21) { print OUTPUTLowMatch ">$ESTindex\n$CDSindex\n$numOfMatch\n$estInfo\n$estSeq\n$cdsInfo\n$cdsSeq\n"; $ctLowMatch++; next; } undef %cdsSeq28s; %cdsSeq28s = (); $lenCDS2 = $lenCDS - 28; for $y (0..$lenCDS2) { $seq28CDS = substr($cdsSeq, $y, 28); if (exists($cdsSeq28s{$seq28CDS})) { $indexList = $cdsSeq28s{$seq28CDS} . ";" . $y; } else { $indexList = $y; } $cdsSeq28s{$seq28CDS} = $indexList; } $lenEST2 = $lenEST - 28; for $y (0..$lenEST2) { $seq28EST = substr($estSeq, $y, 28); if (exists($cdsSeq28s{$seq28EST})) { $indexListHeads = $cdsSeq28s{$seq28EST}; $ESThead = $y; last; } } for ($y = $lenEST2; $y >= 0; $y--) { $seq28EST = substr($estSeq, $y, 28); if (exists($cdsSeq28s{$seq28EST})) { $indexListTails = $cdsSeq28s{$seq28EST}; $ESTtail = $y; last; } } undef @headIndexes; undef @tailIndexes; undef @CDSlengthes; @CDSlengthes = (); @headIndexes = split(';', $indexListHeads); @tailIndexes = split(';', $indexListTails); for $t (@tailIndexes) { for $h (@headIndexes) { $CDSlength = $t - $h + 1; push @CDSlengthes, $CDSlength if ( $CDSlength >= 50 ); } } $ESTlength = $ESTtail - $ESThead + 1; next if ( $ESTlength < 50 ); # print "CDSlength(es): @CDSlengthes\n" if ($ctTotal % 10 == 0); # print "ESTlength: $ESTlength\n" if ($ctTotal % 10 == 0); $equalLen = 1; for $clen (@CDSlengthes) { if (($clen > $ESTlength && $clen - $ESTlength >= 40) || ($ESTlength > $clen && $ESTlength - $clen >= 40)) { $equalLen = 0; last; } } if ($equalLen != 0) { print OUTPUT1 ">$ESTindex\n$CDSindex\n$numOfMatch\n$estInfo\n$estSeq\n$cdsInfo\n$cdsSeq\n"; $ctCandidates++; } else { print OUTPUT2 ">$ESTindex\n$CDSindex\n$numOfMatch\n$estInfo\n$estSeq\n$cdsInfo\n$cdsSeq\n"; $ctOthers++; } } print OUTSUM "ctCandidates: $ctCandidates\n"; print OUTSUM "ctOthers: $ctOthers\n"; print OUTSUM "ctLowMatch: $ctLowMatch\n"; print OUTSUM "ctLongESTs: $ctLongESTs\n"; print OUTSUM "----------------------------------------\n"; print OUTSUM "ctTotal: $ctTotal\n";