#!/usr/bin/perl -w use strict; use Math::CDF; use Math::Round; # # The default input parameters # my $debug = 0; my $noplot = 0; my $gnuplot = ""; my $input_file = "protein.txt"; my $window_size = 7; my $threshold = 0.225; my $graph_file = ""; my $result_file = "results.$$"; my $data_file = "sliding.$$"; my $_prompt = ""; # # Find out gnuplot # if( exists $ENV{GNUPLOT_BIN}) { $gnuplot = $ENV{GNUPLOT_BIN}; } if( $gnuplot eq "" ) { $gnuplot = `which gnuplot`; chomp($gnuplot); } if( $gnuplot eq "" ) { print "AMPA requires gnuplot to be installed on your system."; exit 2; } # # Parse the command # my $cl=join( " ", @ARGV); if (($cl=~/-h/) ||($cl=~/-H/) ) { # print usage print "Usage: AMPA.pl -in= [other options]\n"; print "\n"; print "Available options:\n"; print "-in Fasta sequence input file\n"; print "-t Threshold value (default: 7)\n"; print "-w Window size value (default: 0.225)\n"; print "-rf File where store the program result\n"; print "-df File where store the produced plot data\n"; print "-gf File where store the generated plot\n"; print "-noplot Skip plot creation step\n"; exit 1; } # # If not command line is provided switch to interactive mode # if ($#ARGV==-1 ) { # the input file name print "Enter the name of the sequence file and then press Enter [$input_file]: "; $_prompt = ; chomp($_prompt); if( $_prompt ne "" ) { $input_file=$_prompt }; # the window size print "Enter the desired window size as an odd number and then press Enter [$window_size]: "; $_prompt = ; chomp($_prompt); if( $_prompt ne "" ) { $window_size = $_prompt }; # the threshold print "Enter the desired threshold and then press Enter [$threshold]: "; $_prompt = ; chomp($_prompt); if( $_prompt ne "" ) { $threshold = $_prompt }; } # option '-in': input file name if ( ($cl=~/-in=\s*(\S+)/)) { $input_file = $1; } # option '-w': window size if ( ($cl =~ /-w=\s*(\S+)/)) { $window_size = $1; } # option '-t': threshold value if ( ($cl =~ /-t=\s*(\S+)/)) { $threshold = $1; } # option '-gf': output graph file if ( ($cl =~ /-gf=\s*(\S+)/)) { $graph_file = $1; } # option '-rf': output result file if ( ($cl =~ /-rf=\s*(\S+)/)) { $result_file = $1; } # option '-df': output sliding file if ( ($cl =~ /-df=\s*(\S+)/)) { $data_file = $1; } # option '-debug': enable debug mode if ( ($cl =~ /-debug/)) { $debug = 1; } # option '-noplot': threshold value if ( ($cl =~ /-noplot/)) { $noplot = 1; } if( $debug != 0 ) { print "Window size : $window_size\n"; print "Threshold value : $threshold\n"; print "Input file name : $input_file\n"; print "Graph file name : $graph_file\n"; print "Result file name : $result_file\n"; print "Data file name : $data_file\n"; print "Gnuplot bin : $gnuplot\n"; } # # Main script start here # my(@names,@lengths,@sequences,$sequence_number,$protein); open(PROTEIN, "$input_file") or die "Cannot open '$input_file': $!\n"; #The program should read the user input sequence $/ = ">"; for(my $c=1; $c<(($window_size + 1)/2); $c++) { push (@sequences, 'X'); } while (my $sequence_record = ) { if ($sequence_record eq ">") { next; } my $sequence_name = ""; if ($sequence_record =~ m/([^\n]+)/) { $sequence_name = $1; } else { $sequence_name = "No title was found!"; } $sequence_record =~ s/[^\n]+//; push (@names, $sequence_name); $sequence_record =~ s/[^ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy]//g; push (@sequences, $sequence_record); push (@lengths, length($sequence_record)); my $sequence_length = length($sequence_record); } for(my $c=1; $c<(($window_size + 1)/2); $c++) { push (@sequences, 'X'); } my $sequences = join("",@sequences); close(PROTEIN) or die "Cannot close the file: $!\n"; #The program calls the subroutines to analyze the sequence &sliding($sequences); if(!$noplot) { &gnuplot_sld(); } ### Subroutines # Defines the value at the window center (window_size+1)/2 sub sliding { open(KD,">$data_file") or die("Could not open '$data_file': $!\n"); open(RS,">$result_file") or die("Could not open '$result_file': $!\n"); my($center,$length); my $half = (($window_size + 1)/2); # These are the antimicrobial propensity values for each amino acid my %sliding = ( 'A', 0.307, 'R', 0.106, 'N', 0.240, 'D', 0.479, 'C', 0.165, 'Q', 0.248, 'E', 0.449, 'G', 0.265, 'H', 0.202, 'I', 0.198, 'L', 0.246, 'K', 0.111, 'M', 0.265, 'F', 0.246, 'P', 0.327, 'S', 0.281, 'T', 0.242, 'W', 0.172, 'Y', 0.185, 'V', 0.200, 'X', $threshold ); $protein = shift; print KD "# The window size is currently set at $window_size.\n"; print KD "# Here are the AMSI values for this protein:\n"; print KD "# $names[0]\n"; print KD "# Pos\ APV\n"; print KD "# ---\t-------------------\n"; my $acc1=0; my $acc2=0; my $bacindex=0; my $amvalue=0; my $bacvalue=0; my $prob=0; for(my $i=0;$i<=(length($protein)-$window_size);$i++) { my $window = substr($protein, $i, $window_size); my $sum=0; for(my $j=0; $j<$window_size; $j++) { my $PV = 0; my $residue = substr($window, $j, 1); $PV = $sliding{$residue}; $sum+=$PV; } $center = $i + $half; my $position = $center - 3; my $APV=$sum/$window_size; $APV = sprintf "%.3f", $APV; print KD "$position\t$APV\n"; $amvalue=$amvalue+$APV; if($acc1==3) { if($acc2>=10){ my $init=$position-$acc2-2; my $last=$position-1; my $bacvalue=(($bacvalue)/($last-$init+1)); $prob = &Math::CDF::pnorm(($bacvalue-0.2584)/0.02479); $prob*=100; my $rbacvalue = nearest(.001, $bacvalue); my $rprob = nearest(1, $prob); print RS "Antimicrobial stretch found in $init to $last. Propensity value $rbacvalue ($rprob %) \n"; $bacindex++; $acc2=0; $acc1=0; $bacvalue=0; $prob=0; } if($acc2<10){ $acc1=0; $acc2=0; $bacvalue=0; } } if($acc1!=3){ if($acc2==0) { $acc1=0; $bacvalue=0; } if($APV<=$threshold) { $acc2++; $bacvalue+=$APV } if($APV>$threshold){ $acc1++; $bacvalue+=$APV } } if($position==(length($protein)-(($window_size)-1)) && $acc2>=10) { my $init=$position-$acc2-2; my $last=$position-1; my $bacvalue=(($bacvalue)/($last-$init+1)); $bacindex++; $prob = &Math::CDF::pnorm(($bacvalue-0.2584)/0.02479); $prob*=100; my $rbacvalue = nearest(.001, $bacvalue); my $rprob = nearest(1, $prob); print RS "Antimicrobial stretch found in $init to $last. Propensity value $rbacvalue ($rprob %) \n"; } } print RS "# This protein has $bacindex bactericidal stretches \n"; my $manvalue = nearest(.001, $amvalue/(length($protein)-6)); print RS "# This protein has a mean antimicrobial value of $manvalue \n"; close(KD) or die("Could not close file: $!\n"); } sub gnuplot_sld { my $cmdline = "|$gnuplot -persist"; open (GP, $cmdline) or die "no gnuplot"; # force buffer to flush after each write use FileHandle; GP->autoflush(1); print GP "set title \"Antimicrobial Profile\"\n"; print GP "set xlabel \"Position\"\n"; print GP "set ylabel \"Score\"\n"; print GP "set grid\n"; print GP "set data style lines\n"; # if an output has been specified output to it if( $graph_file ne "" ) { print GP "set terminal png large enhanced size 800 600\n"; print GP "set output '$graph_file' \n"; } print GP "plot \"$data_file\" u 1:2 t \"Antimicrobial profile \" w lines\n"; close GP; }