#!/usr/bin/perl -w use Getopt::Std; #use strict; require 'ctime.pl'; require "/home/vossman/lib/fixnum.pl"; my %args; getopt("ioart", \%args); #GLOBAL HASHES my %atomrad; my %isType; my %typeOK; my %typePoss; my %typeBad; my $typeExist; my $globaltype = "rna"; #my $globaltype = "prot"; #GET TIME my @times = split(' ', &ctime(time)); my $time = $times[1].'-'.$times[2].'-'.$times[4]; #SET INPUT VARIABLES my $restype = $args{t} || "rna"; my ($in,$resi,$atom); if($globaltype eq "prot") { $in = $args{i} || "/home/vossman/phos/out/protein-edit.out"; $resi = $args{r} || "/home/vossman/phos/ProtOr.resi-defs.dat"; $atom = $args{a} || "/home/vossman/phos/ProtOr.atom-defs.dat"; } elsif($globaltype eq "rna") { $in = $args{i} || "/home/vossman/phos/out/edit/ALL-854369-edit.out"; $resi = $args{r} || "/home/vossman/phos/spec-defs.dat"; $atom = $args{a} || "/home/vossman/phos/atom-defs.dat"; } my $out = $args{o} || "./atom-vols-$globaltype-$time.dat"; #my $out2 = $out; #substr($out2,-4) = "-MORE.dat"; main(); #end sub main { #OPEN FILES open(OUT, "> $out") or die "Couldn't open $out for writing: $!\n"; #open(OUT2, "> $out2") # or die "Couldn't open $out2 for writing: $!\n"; print "Writing to $out...\n"; my $header = "# NucProt type set of Atom Volumes\n#\n# Generated: ". (&ctime(time))."# Input file $in\n# v 3.2 2/26/04\n"; $header .= "# Output file $out\n". "# Atom file $atom\n# Type \'$globaltype\'\n#\n#\n#\n\n"; $header .= "type \tcount\t\%\[OK\]\trad \tmedian\t mode\t mean \t sd \t10%ile\t90%ile". "\tskew \tkurt \tnum\tatoms\n". "-----\t-----\t-----\t----\t------\t-----\t------\t----\t-----". "\t-----\t-----\t-----\t---\t". hyph(40)."\n"; print OUT $header; #print OUT2 $header; my @atomdefs = parseAtom($atom); @atomdefs = sort { $a cmp $b } @atomdefs; print STDERR "done parsing atom file (",($#atomdefs+1), " types found)\n\n"; #my @res = parseResi($resi); #print STDERR "done parsing resi file (",($#res+1)," found)\n"; my $i = -1; my ($count,$found,$notresi,$nottype,$notgood) = (0,0,0,0,0); my $total = numLines($in); my $carrot = $total/50; open(IN, "< $in") or die "Couldn't open $in for reading: $!\n"; print STDERR "Parsing in file ($total lines)\n", "|----+----+----+----+----|----+----+----+----+----|\n"; while(defined(my $line = )) { my @chunks = split('\t',$line); foreach my $test (@chunks) { $test =~ s/\s+$//; $test =~ s/^\s+//; } #CHECK IF VALIDITY if(!$isType{$chunks[6]}) { $nottype++; } elsif(!isResi($chunks[3])) { $notresi++; } #MODIFY FOR PROTEIN elsif(isResi($chunks[3]) && $isType{$chunks[6]}) { $typeExist{$chunks[6]}=1; if($chunks[13] eq "\[OK\]") { push(@{$chunks[6]},$chunks[10]); if(!$typeOK{$chunks[6]}) { $typeOK{$chunks[6]}=0; } $typeOK{$chunks[6]}++; $found++; } elsif($chunks[13] eq "possible") { if(!$typePoss{$chunks[6]}) { $typePoss{$chunks[6]}=0; } $typePoss{$chunks[6]}++; $found++; } elsif($chunks[13] eq "bad") { if(!$typeBad{$chunks[6]}) { $typeBad{$chunks[6]}=0; } $typeBad{$chunks[6]}++; $found++; } else { $notgood++; } if($count > $carrot) { print STDERR "^"; $carrot += ($total/50); } } $count++; if($count > $carrot) { print STDERR "^"; $carrot += ($total/50); } } close(IN); print STDERR "\nFound $found of $total (parsed $count) ", "[r$notresi,t$nottype,g$notgood]\n"; # print STDERR "\n [ "; print STDERR "\n\n"; #print STDERR $header; foreach my $type (@atomdefs) { if($typeExist{$type}) { my $line; my $perc = fixnum21(100*$typeOK{$type}/($typeOK{$type}+$typePoss{$type}+$typeBad{$type})); $line = "$type\t".fixnum50($typeOK{$type})."\t".$perc."%\t"; my ($mean,$median,$stdev,$min,$max,$mode,$skew,$kurt) = analyzeVol($type,@{$type}); $line .= fixnum12($atomrad{$type})."\t".fixnum32($median)."\t".fixnum22($mode)."\t".fixnum23($mean). "\t".fixnum12($stdev)."\t".fixnum22($min)."\t".fixnum22($max)."\t". fixnum22($skew)."\t".fixnum22($kurt)."\t"; #my (@atoms) = parseResi($resi,$type); my (@atoms) = 0; $line .= fixnum30($#atoms+1)."\t"; #print STDERR $line,"atoms printed to file\n"; foreach my $atm (@atoms) { $line .= "$atm,"; } substr($line,-1) = ""; $line .= "\n"; print OUT $line; } } close(OUT); #close(OUT2); }; # end main() sub isResi { my ($res) = @_; if($globaltype =~ "prot") { return isProt($res); } elsif($globaltype =~ "rna") { return isRNA($res); } return 0; }; sub isRNA { my ($res) = @_; my @RNA = ("A","C","U","G","ADE","CYT","URI","GUA"); foreach my $base (@RNA) { if($res eq $base) { return 1; } } return 0; }; sub isProt { my ($res) = @_; my @PROT = ("GLY","ALA","MET","GLU","GLN","ASN","ASP","TRP","THR","TYR", "ILE","LEU","VAL","CYS","PRO","ARG","SER","LYS","PHE","HIS", "CYH","CSS"); foreach my $aa (@PROT) { if($res eq $aa) { return 1; } } return 0; }; sub mode { my (@list) = @_; my %modes; my $p = 1.69897000434; # precision my @vals; foreach my $vol (@list) { my $v = int($vol*(10**$p)+0.5); $v /= (10**$p); if($modes{$v}) { $modes{$v}++; } else { push(@vals,$v); $modes{$v}=1; } } my ($mode,$modemax); $modemax = 1; my @crash; foreach my $v (@vals) { #print STDERR "$v\n"; if(!$modes{$v} || !$modemax) { #??? } elsif($modes{$v} > $modemax) { $mode = $v; $modemax = $modes{$v}; @crash = ($v); } elsif($modes{$v} == $modemax) { push(@crash,$v); } } if($#crash > 0) { print STDERR " Crash on Mode \[$modemax\]: "; foreach my $v (@crash) { print STDERR fixnum($v,2,int($p+0.9),"r"),", "; } print STDERR "\n"; } return $mode; }; sub mode2 { my (@list) = @_; my %modes; my $p = 0.01; # precision my @vals; foreach my $vol (@list) { my $v = int($vol/$p+0.5)*$p; if($modes{$v}) { $modes{$v}++; } else { push(@vals,$v); $modes{$v}=1; } } my ($mode,$modemax); $modemax = 1; my @crash; my $grange = 50; foreach my $v (@vals) { my $avgmode=0; for(my $k=-$grange;$k<=$grange;$k++) { my $v2 = int(($v - $k*$p)*1000 + $p/100)/1000; if($modes{$v2}) { my $factor = exp(-0.05*($k**2)); $factor = int($factor*1000)/1000; $avgmode += $modes{$v2}*$factor; } } if(!$avgmode) { #??? } elsif($avgmode > $modemax) { $mode = $v; $modemax = $avgmode; @crash = ($v); } elsif($avgmode == $modemax) { push(@crash,$v); } } if($#crash > 0) { print STDERR " Crash on Mode \[$modemax\]: "; foreach my $v (@crash) { print STDERR fixnum($v,2,int(-1*log($p)/log(10)+0.95),"r"),", "; } print STDERR "\n"; } return $mode; }; sub hyph { my ($num) = @_; my $str; for(my $i=0; $i<$num; $i++) { $str.="-"; } return $str; }; sub parseResi { my ($resi,$type) = @_; my @list; open(RESI, "< $resi") or die "Couldn't open $resi for reading: $!\n"; my $i; my $res; while(defined(my $line = )) { my @chunks = split(' ', $line); if(!$i) { if($chunks[1]) { $res=$chunks[0]; $i=$chunks[1]; } } elsif($res) { if($chunks[0] && $chunks[1] eq $type) { push(@list,$res."_".$chunks[0]); } $i--; } } close(RESI); return @list; }; sub parseAtom { ($atom) = @_; my @atomdefs; open(ATOM, "< $atom") or die "Couldn't open $atom for reading: $!\n"; while(defined(my $line = )) { if(length($line) > 3) { my @chunks = split(' ', $line); if($#chunks > 1) { push(@atomdefs,$chunks[0]); $isType{$chunks[0]}=1; $atomrad{$chunks[0]}=$chunks[1]; } else { print STDERR "E"; } } } close(ATOM); return @atomdefs; }; sub numLines { my ($in) = @_; open(IN, "< $in") or die "Couldn't open $in for reading: $!\n"; my $count=0; while(defined(my $line=)) { $count++; } close(IN); return $count; }; sub analyzeVol { my ($type,@vol) = @_; @vol = sort { $a <=> $b } @vol; my $cut = int(0.10*$typeOK{$type}); my ($min,$max) = ($vol[$cut],$vol[$#vol-$cut]); my ($mean,$stdev) = meanstdev(@vol); my $median = median(@vol); my $mode = mode2(@vol); my $skew = skew($mean,$stdev,@vol); my $kurt = kurtosis($mean,$stdev,@vol); return ($mean,$median,$stdev,$min,$max,$mode,$skew,$kurt); }; sub meanstdev { my (@nums) = @_; my ($sum,$sumsq) = (0,0); foreach my $num (@nums) { $sum += $num; $sumsq += ($num**2); } my $mean = ($sum/($#nums+1)); my $stdev = sqrt(($sumsq-$sum**2/($#nums+1))/$#nums); return ($mean,$stdev); }; sub median { #assume sorted my (@vol) = @_; my $median; if($#vol%2 == 0) { $median = ($vol[($#vol)/2] + $vol[($#vol)/2+1])/2; } else { $median = $vol[($#vol+1)/2]; } }; sub skew { my ($mean,$stdev,@list) = @_; my $skew; foreach my $v (@list) { $skew += ($v-$mean)**3 / ($#list + 1); } $skew /= $stdev**3; return $skew; } sub kurtosis { my ($mean,$stdev,@list) = @_; my $kurt; foreach my $v (@list) { $kurt += ($v-$mean)**4 / ($#list + 1); } $kurt = ($kurt/$stdev**4)-3; return $kurt; }