#!/usr/bin/perl # Perl script: ESR2.pl # Authors: Xiang Shao & Alexei Fedorov # Bioinformatics Lab, Department of Medicine # Medical College of Ohio, Toledo, Ohio, USA # April, 2005 All rights reserved # Permission should be obtained prior to using this program # Contact afedorov@mco.edu # This Perl script is the 2nd in the SCRAMBLING package to screen exon # repetition and exon scrambling. # It compares each pair of entries of the resulted files from ESR1.pl, # EST1stHalfResult and EST2ndHalfResult to output the best matched ESTs # for the CDSs. open (OUTPUT, ">BestMatched") || die "Can't open BestMatched : $!\n"; open (OUTSUM, ">ESTsGenesSum") || die "Can't open ESTsGenesSum : $!\n"; @firstHalf = @secondHalf = (); open (FIRST, "EST1stHalfResult") || die "Can't open EST1stHalfResult : $!\n"; while () { chomp; push @firstHalf, $_; } close(FIRST); open (SECOND, "EST2ndHalfResult") || die "Can't open EST2ndHalfResult : $!\n"; while () { chomp; push @secondHalf, $_; } close(SECOND); $totalESTs1 = $#firstHalf + 1; $totalESTs2 = $#secondHalf + 1; if ($totalESTs1 == $totalESTs2) { print "\n$totalESTs1\n"; } #else { # print "\nWrong! totalESTs1\ttotalESTs2\n"; # exit; #} open (CDS, "human_CDS") || die "Can't open human_CDS : $!\n"; $/ = ">"; @infoCDS = @seqCDS = (); while () { $cds = ""; @lines = split ("\n", $_); $info = $lines[0]; $info =~ s/>//g; push @infoCDS, $info; for $ln (1..$#lines) { $cds .= $lines[$ln]; } $cds = lc($cds); $cds =~ s/\s//g; $cds =~ s/>//g; push @seqCDS, $cds; } close(CDS); $ctNone = $ctUnresolved = 0; open (EST, "human.EST") || die "Can't open human.EST : $!\n"; #EST database $ESTid = 1; while () { $flagDoOutput = 0; $CDSindex = -1; $matchCt = 0; $e = $ESTid - 1; if ($firstHalf[$e] =~ m/None/ && $secondHalf[$e] =~ m/None/) { $ctNone++; } elsif ($firstHalf[$e] =~ m/None/ && $secondHalf[$e] =~ m/Unresolved/) { $ctUnresolved++; } elsif ($firstHalf[$e] =~ m/Unresolved/ && $secondHalf[$e] =~ m/None/) { $ctUnresolved++; } elsif ($firstHalf[$e] =~ m/Unresolved/ && $secondHalf[$e] =~ m/Unresolved/) { $ctUnresolved++; } elsif ($firstHalf[$e] =~ m/:/ && $firstHalf[$e] !~ m/Unresolved/ && $secondHalf[$e] =~ m/Unresolved/) { if ($firstHalf[$e] =~ m/:/) { $gene1 = $`; $gene1ct = $'; } if ($secondHalf[$e] =~ m/:/) { $gene2ct = $'; } if ($gene1ct > $gene2ct) { $flagDoOutput = 1; $matchCt = $gene1ct; $CDSindex = $gene1; } else { $ctUnresolved++; } } elsif ($secondHalf[$e] =~ m/:/ && $secondHalf[$e] !~ m/Unresolved/ && $firstHalf[$e] =~ m/Unresolved/) { if ($secondHalf[$e] =~ m/:/) { $gene2 = $`; $gene2ct = $'; } if ($firstHalf[$e] =~ m/:/) { $gene1ct = $'; } if ($gene2ct > $gene1ct) { $flagDoOutput = 1; $matchCt = $gene2ct; $CDSindex = $gene2; } else { $ctUnresolved++; } } elsif ($firstHalf[$e] =~ m/:/ && $firstHalf[$e] !~ m/Unresolved/ && $secondHalf[$e] =~ m/None/) { if ($firstHalf[$e] =~ m/:/) { $gene1 = $`; $gene1ct = $'; $flagDoOutput = 1; $matchCt = $gene1ct; $CDSindex = $gene1; } } elsif ($secondHalf[$e] =~ m/:/ && $secondHalf[$e] !~ m/Unresolved/ && $firstHalf[$e] =~ m/None/) { if ($secondHalf[$e] =~ m/:/) { $gene2 = $`; $gene2ct = $'; $flagDoOutput = 1; $matchCt = $gene2ct; $CDSindex = $gene2; } } elsif ($firstHalf[$e] =~ m/:/ && $firstHalf[$e] !~ m/Unresolved/ && $secondHalf[$e] =~ m/:/ && $secondHalf[$e] !~ m/Unresolved/) { if ($firstHalf[$e] =~ m/:/) { $gene1 = $`; $gene1ct = $'; } if ($secondHalf[$e] =~ m/:/) { $gene2 = $`; $gene2ct = $'; } if ($gene1ct > $gene2ct) { $flagDoOutput = 1; $matchCt = $gene1ct; $CDSindex = $gene1; } elsif ($gene2ct > $gene1ct) { $flagDoOutput = 1; $matchCt = $gene2ct; $CDSindex = $gene2; } else { $ctUnresolved++; } } #else { # print "\nWrong! NoSuchCase\n"; # print OUTPUT "NoSuchCase\n"; # exit; # } if ($flagDoOutput == 1) { $estInfo = $estSeq = ""; @lines = split ("\n", $_); $estInfo = $lines[0]; $estInfo =~ s/>//g; for $ln (1..$#lines) { $estSeq .= $lines[$ln]; } $estSeq = lc($estSeq); $estSeq =~ s/\s//g; $estSeq =~ s/>//g; $cdsId = $CDSindex - 1; print OUTPUT ">$ESTid\n$CDSindex\n$matchCt\n$estInfo\n$estSeq"; print OUTPUT "\n$infoCDS[$cdsId]\n$seqCDS[$cdsId]\n"; } print "\n$ESTid" if ($ESTid%10000 == 0); $ESTid++; } close(EST); $percentageNone = $ctNone/$totalESTs1*100; $percentageUnresolved = $ctUnresolved/$totalESTs1*100; print OUTSUM "\nThere are total $ctNone \"None\"s ($percentageNone\%) and $ctUnresolved \"Unresolved\"s ($percentageUnresolved\%)\n";