#!/usr/bin/perl
# Perl script: ESR5.pl
# Authors: Xiang Shao & Alexei Fedorov
# Bioinformatics Lab, Department of Medicine
# Medical College of Ohio, Toledo, Ohio, USA
# April, 2005 All rights reserved
# Permission should be obtained prior to using this program
# Contact afedorov@mco.edu
#
# This Perl script is the 5th in the SCRAMBLING package to screen exon
# repetition and exon scrambling.
# It analyzes the scores generated in the previous step to judge whether
# a candidate EST has exon repetition or exon scrambling or both. It also
# filters out pseudo-positives due to gt repeats, etc.
#
# Usage: ESR5.pl <input-file>
#
# Input:  '>'-separated records. Within a record the lines are:
#           0  EST index (unused)        1  CDS index (unused)
#           2  number of matches         3  EST info      4  EST sequence
#           5  CDS info                  6  CDS sequence  7  number of exons
#           then (info, sequence) line pairs for each exon, the exon list
#           ("id:score" entries separated by ';'), the number of introns,
#           (info, sequence) line pairs for each intron, and the intron list.
# Output: RepExons<input>   records with at least one repeated exon
#         ScrbExons<input>  records whose exons appear out of order in the EST
#         SumExons<input>   summary counts
#         debug             created (empty) for backward compatibility

use strict;
use warnings;

use constant {
    KMER_LEN      => 12,      # window length used to anchor an exon in the EST
    REPEAT_CUTOFF => 1.2,     # exon score above this marks the EST as "repeated"
    MATCH_MIN     => 0.05,    # exon score must exceed this to count as present
    MATCH_MAX     => 1,       # ... and must not exceed this
};

# Entry point. Takes the input file name (also used as the output-name
# suffix), processes every record and writes the three report files.
# Dies with a message on any open/close failure. Returns 0 on success.
sub main {
    my ($prefix) = @_;
    die "Usage: $0 <input-file>\n"
        unless defined $prefix && length $prefix;

    # Open the input first so that a bad path does not truncate existing
    # output files from an earlier run.
    open(my $in, '<', $prefix)
        or die "Can't open $prefix : $!\n";
    open(my $rep_out, '>', "RepExons$prefix")
        or die "Can't open RepExons$prefix : $!\n";
    open(my $scrb_out, '>', "ScrbExons$prefix")
        or die "Can't open ScrbExons$prefix : $!\n";
    open(my $sum_out, '>', "SumExons$prefix")
        or die "Can't open SumExons$prefix : $!\n";
    # Nothing is written here any more; the file is still created because
    # earlier versions of this script always produced it.
    open(my $dbg, '>', 'debug')
        or die "Can't open debug : $!\n";

    # Starts at -1 because the first chunk is whatever precedes the first
    # '>' separator and is not a record.
    my $chunk_no   = -1;
    my $rep_count  = 0;
    my $scrb_count = 0;

    {
        local $/ = '>';    # one record per read
        while (my $chunk = <$in>) {
            $chunk_no++;
            print "$chunk_no\n" if $chunk_no % 100 == 0;    # progress
            next if $chunk_no == 0;
            chomp $chunk;    # drop the trailing '>' separator

            my $rec = parse_record($chunk);
            my ($has_repeat, $present_ids) = classify_exons($rec);

            if ($has_repeat) {
                $rep_count++;
                write_record($rep_out, $rec);
                next;
            }

            my @order     = exon_positions_in_est($rec, $present_ids);
            my $unordered = count_out_of_order(@order);
            if ($unordered > 0) {
                $scrb_count++;
                write_record($scrb_out, $rec);
                print {$scrb_out} join(' ', @order), "\n";
                print {$scrb_out} "$unordered\n";
            }
        }
    }
    close($in) or die "Can't close $prefix : $!\n";

    # An empty input never enters the loop; report 0 rather than -1.
    my $total = $chunk_no > 0 ? $chunk_no : 0;
    print {$sum_out} "Total # of ESTs processed: $total\n";
    print {$sum_out}
        "Total # of candidate ESTs containing repeated exon: $rep_count\n";
    print {$sum_out}
        "Total # of candidate ESTs containing scrambled exon: $scrb_count\n";

    # Buffered write errors only surface at close, so check every handle.
    close($rep_out)  or die "Can't close RepExons$prefix : $!\n";
    close($scrb_out) or die "Can't close ScrbExons$prefix : $!\n";
    close($sum_out)  or die "Can't close SumExons$prefix : $!\n";
    close($dbg)      or die "Can't close debug : $!\n";

    return 0;
}

# Convert a count line to a non-negative integer; anything that does not
# start with digits counts as 0.
sub _to_count {
    my ($text) = @_;
    return $text =~ /\A\s*(\d+)/ ? $1 + 0 : 0;
}

# Split one record into its fields. Missing lines become empty strings so a
# truncated record never yields undef. Returns a hash reference; the
# exon/intron lists are arrays private to this record, so nothing can leak
# in from a previously parsed record.
sub parse_record {
    my ($chunk) = @_;
    my @lines = split /\n/, $chunk;
    my $field = sub {
        my ($idx) = @_;
        return defined $lines[$idx] ? $lines[$idx] : '';
    };

    my %rec = (
        num_match => $field->(2),
        est_info  => $field->(3),
        est_seq   => $field->(4),
        cds_info  => $field->(5),
        cds_seq   => $field->(6),
        num_exons => $field->(7),    # kept verbatim for output
    );

    # Every offset below is derived from the counts, so a record with zero
    # exons or zero introns is read from the right lines.
    my $exon_total = _to_count($rec{num_exons});
    my $cursor     = 8;
    $rec{exons} = [];
    for (1 .. $exon_total) {
        push @{ $rec{exons} },
            { info => $field->($cursor), seq => $field->($cursor + 1) };
        $cursor += 2;
    }
    $rec{exon_list}   = $field->($cursor++);
    $rec{num_introns} = $field->($cursor++);    # kept verbatim for output

    my $intron_total = _to_count($rec{num_introns});
    $rec{introns} = [];
    for (1 .. $intron_total) {
        push @{ $rec{introns} },
            { info => $field->($cursor), seq => $field->($cursor + 1) };
        $cursor += 2;
    }
    $rec{intron_list} = $field->($cursor);

    return \%rec;
}

# Walk the "id:score" entries of the exon list (at most one per exon of the
# record). Returns ($has_repeat, \@present_ids): $has_repeat is true when
# any score exceeds REPEAT_CUTOFF; @present_ids holds the ids whose score is
# in (MATCH_MIN, MATCH_MAX], in list order. Entries without a ':' or without
# a score are skipped.
sub classify_exons {
    my ($rec) = @_;
    my @entries    = split /;+/, $rec->{exon_list};
    my $has_repeat = 0;
    my @present_ids;

    for my $k (0 .. $#{ $rec->{exons} }) {
        my $entry = $entries[$k];
        next unless defined $entry;
        my ($id, $score) = split /:/, $entry, 2;
        next unless defined $score && length $score;

        if ($score > REPEAT_CUTOFF) {
            $has_repeat = 1;
        }
        elsif ($score > MATCH_MIN && $score <= MATCH_MAX) {
            push @present_ids, $id;
        }
    }
    return ($has_repeat, \@present_ids);
}

# For each listed exon id (1-based), find the first KMER_LEN-long window of
# the lower-cased exon sequence that occurs in the EST and record the first
# EST offset of that window. Exons with no matching window, or with an id
# outside the record's exon list, contribute nothing. Returns the offsets in
# the order of @$ids.
# NOTE(review): the EST is compared as-is (not lower-cased), as before;
# presumably the EST sequence is already lower case - confirm upstream.
sub exon_positions_in_est {
    my ($rec, $ids) = @_;
    my $est = $rec->{est_seq};

    # First offset of every EST window; equivalent to scanning the windows
    # left to right for each lookup, but a single pass.
    my %first_pos;
    for my $pos (0 .. length($est) - KMER_LEN) {
        my $kmer = substr($est, $pos, KMER_LEN);
        $first_pos{$kmer} = $pos unless exists $first_pos{$kmer};
    }

    my $exon_total = scalar @{ $rec->{exons} };
    my @order;
    for my $id (@$ids) {
        next unless $id =~ /\A\s*\d+\s*\z/;
        next if $id < 1 || $id > $exon_total;

        my $exon = lc $rec->{exons}[ $id - 1 ]{seq};
        for my $pos (0 .. length($exon) - KMER_LEN) {
            my $hit = $first_pos{ substr($exon, $pos, KMER_LEN) };
            next unless defined $hit;
            push @order, $hit;
            last;
        }
    }
    return @order;
}

# Count how many offsets are smaller than the largest offset seen before
# them, i.e. how many exons sit in the EST ahead of an earlier-listed exon.
sub count_out_of_order {
    my @order     = @_;
    my $high      = -1;
    my $unordered = 0;
    for my $pos (@order) {
        if ($pos < $high) {
            $unordered++;
        }
        else {
            $high = $pos;
        }
    }
    return $unordered;
}

# Write one record to $fh in the report layout: header fields, exon
# (info, sequence) pairs, exon list, intron count, intron pairs, intron list.
sub write_record {
    my ($fh, $rec) = @_;
    my @out = (
        ">$rec->{num_match}",
        $rec->{est_info}, $rec->{est_seq},
        $rec->{cds_info}, $rec->{cds_seq},
        $rec->{num_exons},
        (map { ($_->{info}, $_->{seq}) } @{ $rec->{exons} }),
        $rec->{exon_list},
        $rec->{num_introns},
        (map { ($_->{info}, $_->{seq}) } @{ $rec->{introns} }),
        $rec->{intron_list},
    );
    print {$fh} "$_\n" for @out;
    return;
}

# Run only when executed as a script, so the file can also be loaded
# (e.g. by a test) without side effects.
main(@ARGV) unless caller;

1;