#!/usr/bin/perl
# This script creates the peptide database starting from a fasta database downloaded. 
#
# By Alessandra Tiengo - Lab. Informatica Medica - Dip. Informatica e 
# Sistemistica - Univ. Pavia - 2011
#
# Algorithm submitted to BMC Bioinformatics by Tiengo et al. 
#

use IO::Zlib;
use Cwd;

###subroutines ###

######controls the user input
sub answer {
	# Normalises a yes/no reply.
	#   $_[0] : the reply typed by the user.
	# The reply is upper-cased; as long as it starts with a character other
	# than Y or N a warning is printed and a new reply is read from STDIN.
	# Returns the upper-cased reply that ended the loop.
	my ($reply) = @_;
	$reply = uc $reply;
	until ( $reply !~ /^[^YN]/ ) {
		print "\nWrong input. Answer Y o N\n";
		$reply = uc( scalar <STDIN> );
		chomp $reply;
	}
	return $reply;
}
#####

#####this subroutine controls a numeric input
sub number {
	# Validates a numeric reply.
	#   $_[0] : the reply typed by the user.
	#   $_[1] : the largest value accepted.
	# While the reply contains a non-digit character or exceeds the maximum,
	# a warning is printed and a new reply is read from STDIN.
	# Returns the accepted reply.
	my ( $number, $n ) = @_;
	while ( $number =~ /\D/ || $number > $n ) {

		# The message used to interpolate an unset variable, so the upper
		# bound was never shown to the user.
		print "\nWrong input. Insert a number from 0 to $n\n";
		$number = <STDIN>;
		chomp($number);
	}
	return $number;
}
#####

#####this subroutine computes the molecular weight of a protein or a peptide
sub molecular_weight {
	# Computes the molecular weight of a protein or peptide.
	#   $_[0] : sequence (protein or peptide)
	#   $_[1] : reference to the array of residue symbols
	#   $_[2] : reference to the array of residue masses (same order)
	#   $_[3] : 0 for a protein (average water mass is added),
	#           anything else for a peptide (monoisotopic water mass)
	# Returns the water mass plus, for every table row, the row mass times
	# the number of occurrences of its symbol in the sequence.
	my ( $mol, $ref_aa, $ref_mw, $flag ) = @_;

	my $weight = ( $flag == 0 )
	  ? 18.01524     #average molecular weight of water
	  : 18.01056;    #monoisotopic molecular weight of water

	for my $i ( 0 .. $#{$ref_aa} ) {
		my $residue = $ref_aa->[$i];

		# An empty symbol (e.g. a blank table line) must be skipped: an
		# empty pattern makes Perl reuse the last successful pattern, which
		# would count the previous residue a second time.
		next if !defined $residue || $residue eq '';

		# \Q...\E: the symbol is matched literally, never as a regex.
		my $count = () = $mol =~ /\Q$residue\E/g;
		$weight += $ref_mw->[$i] * $count;
	}
	return $weight;
}

#####

#this subroutine computes the isolectric point of a protein
sub isoel_point {
	# Estimates the isoelectric point of a protein by bisection on pH.
	#   $_[0] : protein sequence
	#   $_[1] : reference to the array of group labels (residue symbols plus
	#           the rows whose label contains "c_term" / "n_term")
	#   $_[2] : reference to the array of pK values (same order)
	#   $_[3] : reference to the array of charge signs; a value containing
	#           "-" marks an acidic group, anything else a basic group
	# Returns the pH at which the summed partial charges change sign,
	# searched in [0, 14] starting from 6.5 with a tolerance of 0.001.
	my ( $protein, $ref_aminoq, $ref_pk, $ref_charge ) = @_;
	my @aminoq = @{$ref_aminoq};
	my @pk     = @{$ref_pk};
	my @charge = @{$ref_charge};

	my $E       = 0.001;    #precision pi=ph+/-E
	my $ph      = 6.5;
	my $ph_prev = 0;
	my $ph_next = 14;

	# Count every group once and locate the terminal rows. The counts are
	# kept in a lexical array (they used to leak into a package array).
	my @number_aa;
	my ( $c_term, $n_term );
	for my $i ( 0 .. $#aminoq ) {
		my $label = defined $aminoq[$i] ? $aminoq[$i] : '';
		$number_aa[$i] = 0;
		if ( $label ne '' ) {
			$number_aa[$i] = () = $protein =~ /\Q$label\E/g;
		}
		$c_term = $i if index( lc($label), "c_term" ) > -1;
		$n_term = $i if index( lc($label), "n_term" ) > -1;
	}

	#bisection method: stop once both half-intervals are narrower than $E
	while ( ( $ph - $ph_prev ) >= $E || ( $ph_next - $ph ) >= $E ) {
		my $total_charge = 0;

		# Terminal groups contribute once each. A table without the
		# corresponding row contributes nothing (previously row 0 was
		# silently used in its place).
		if ( defined $c_term ) {
			my $cr_cooh = 10**( $ph - $pk[$c_term] );
			$total_charge += ( -1 * $cr_cooh ) / ( 1 + $cr_cooh );
		}
		if ( defined $n_term ) {
			my $cr_nh2 = 10**( $pk[$n_term] - $ph );
			$total_charge += $cr_nh2 / ( 1 + $cr_nh2 );
		}

		#partial charge of the side chains
		for my $i ( 0 .. $#aminoq ) {
			next if defined $c_term && $i == $c_term;
			next if defined $n_term && $i == $n_term;
			my $partial;
			if ( $charge[$i] =~ /-/ ) {
				my $cr = 10**( $ph - $pk[$i] );
				$partial = ( -1 * $cr ) / ( 1 + $cr );
			}
			else {
				my $cr = 10**( $pk[$i] - $ph );
				$partial = $cr / ( 1 + $cr );
			}
			$total_charge += $partial * $number_aa[$i];
		}

		if ( $total_charge < 0 ) {

			# net negative: the root lies at a lower pH
			$ph_next = $ph;
			$ph      = $ph - ( ( $ph - $ph_prev ) / 2 );
		}
		else {
			# net positive (or zero): the root lies at a higher pH
			$ph_prev = $ph;
			$ph      = $ph + ( ( $ph_next - $ph ) / 2 );
		}
	}
	return $ph;
}

### subroutines end ###

############## PARAMETERS - BEGIN ##############
# Load settings.ini from the current working directory. Every line that does
# not start with "#" is stored, in file order, in @settings (blank lines are
# stored too, so they shift the indices used below).
open( SET, '<', 'settings.ini' )
  or die "Cannot open settings.ini: $!";
$n = 0;
while ( !eof SET ) {
	$row = <SET>;
	$row =~ s/[\r\n]+\z//;    # strip LF or CRLF so values compare cleanly
	if ( index( $row, "#" ) != 0 ) {
		$settings[$n] = $row;
		$n++;
	}
}
close(SET);
$file_name   = $settings[1]; #fasta database
$organism    = $settings[2]; #organism of interest
$n_mc        = $settings[3]; #number of missed cleavages
$low_b       = $settings[4];  #lower bound of the interpretation range
$upp_b       = $settings[5]; #upper bound of the interpretation range

$flag_ptm    = "Y";              #PTMs Y or N

############## PARAMETERS - END #################

#current directory
$path_tool = cwd;

# Drop the last four characters of the working directory.
# NOTE(review): presumably this removes a trailing "/src", i.e. the script is
# expected to be launched from the tool's src directory - confirm.
$path_tool = substr( $path_tool, 0, length($path_tool) - 4 );

#tool directories
$dir_data = $path_tool . '/data';
$dir_db   = $path_tool . '/db';
$dir_src  = $path_tool . '/src';
$dir_tmp  = $path_tool . '/tmp';

#name of the database to be created
$dir_name = "db_".time;
$nome_db=$dir_name;
#name of the file storing the list of proteins of the database
$protein_file = $dir_name . "prot.gz";
chdir($dir_tmp) or die "Cannot enter $dir_tmp: $!";



#$file_name="prova.fasta";
$dir_name = $dir_db . '/' . $dir_name;
if ( !-d $dir_name ) {

	# Every output file is written below this directory, so a failure here
	# must stop the run instead of being ignored.
	mkdir($dir_name) or die "Cannot create $dir_name: $!";
}
chdir($dir_src) or die "Cannot enter $dir_src: $!";

#ptm
if ( $flag_ptm eq "Y" ) {

	#read ptm table: name <TAB> residue(s) <TAB> monoisotopic mass <TAB>
	#average mass <TAB> type; lines starting with "#" are skipped
	open( PTM, '<', 'table_ptm.txt' )
	  or die "Cannot open table_ptm.txt: $!";
	$k = 0;
	while ( !eof PTM ) {
		$ptm_file = <PTM>;

		# Strip LF or CRLF; a stray CR would end up in the type column and
		# make the 'eq "V"' test below fail.
		$ptm_file =~ s/[\r\n]+\z//;
		if ( index( $ptm_file, "#" ) != 0 ) {
			@ptm = split( "\t", $ptm_file );
			$ptm_name[$k]      = $ptm[0];
			$ptm_name_old[$k]  = $ptm_name[$k];
			$ptm_aa[$k]        = $ptm[1];
			$ptm_weight_mn[$k] = $ptm[2];         #monoisotopic
			$ptm_weight_av[$k] = $ptm[3];         #average
			$flag_ptmtype[$k]  = $ptm[4];         #fixed or variable
			if ( $flag_ptmtype[$k] eq "V" ) {
				$flag_maxptm[$k] = 2;

			}
			else {
				# Any type other than "V" is handled as a fixed PTM: its
				# name gets the "fix" suffix and its row index is recorded
				# under the modified residue.
				$flag_maxptm[$k] = 0;
				$ptm_name[$k]    = $ptm_name[$k] . "fix";
				$hash_ptm_fix{$ptm_aa[$k]}=$k;
			}
			$k++;
		}
	}
	close(PTM);

	#number of PTMs available
	$ptm_more = @ptm_aa;

	# Save a copy of the table (with the original names) in the database
	# directory.
	chdir($dir_name) or die "Cannot enter $dir_name: $!";
	open( PTM, '>', 'ptm_db.txt' )
	  or die "Cannot create ptm_db.txt in $dir_name: $!";
	for ( $i = 0 ; $i < @ptm_name ; $i++ ) {
		print PTM
"$ptm_name_old[$i]\t$ptm_aa[$i]\t$ptm_weight_mn[$i]\t$ptm_weight_av[$i]\t$flag_ptmtype[$i]\n";
	}
	close(PTM);
	chdir($dir_src) or die "Cannot enter $dir_src: $!";

}
else {
	$ptm_more = 0;
}
$ptm_more2=$ptm_more;
#read the molecular weights table
# Residue mass table: symbol <TAB> monoisotopic mass <TAB> average mass;
# lines starting with "#" are skipped.
open( AA, '<', 'table_mw.txt' )
  or die "Cannot open table_mw.txt: $!";
$k = 0;
while ( !eof AA ) {
	$aa_file = <AA>;
	$aa_file =~ s/[\r\n]+\z//;    # strip LF or CRLF
	if ( index( $aa_file, "#" ) != 0 ) {
		@aa_mw = split( "\t", $aa_file );
		$aa[$k]    = $aa_mw[0];
		$mw_mn[$k] = $aa_mw[1];
		$mw_av[$k] = $aa_mw[2];

		#CAM	C	57.021464	57.0513	F
		#fixed PTM: its mass is folded into the residue mass, and the
		#number of PTMs still to be expanded is decreased
		if ( exists($hash_ptm_fix{$aa[$k]})) {
			$mw_mn[$k] = $mw_mn[$k] + $ptm_weight_mn[$hash_ptm_fix{$aa[$k]}];
			$mw_av[$k] = $mw_av[$k] + $ptm_weight_av[$hash_ptm_fix{$aa[$k]}];
			$ptm_more2--;
		}
		$k++;
	}
}
close(AA);

# References handed to molecular_weight().
$ref_aa    = \@aa;
$ref_mw_mn = \@mw_mn;
$ref_mw_av = \@mw_av;

#isoelectric point
# pK table: group label <TAB> pK <TAB> charge sign; lines starting with "#"
# are skipped.
open( PI, '<', 'table_pi.txt' )
  or die "Cannot open table_pi.txt: $!";
$k=0;
while(! eof PI){
	$pk_string=<PI>;
	$pk_string =~ s/[\r\n]+\z//;    # strip LF or CRLF
	if(index($pk_string,"#")!=0){
		@aa_pk_charge=split("\t",$pk_string);
		$aminoq[$k]=$aa_pk_charge[0];
		$pk[$k]=$aa_pk_charge[1];
		$charge[$k]=$aa_pk_charge[2];
		$k++;
	}
}
close(PI);

# References handed to isoel_point().
$ref_aminoq=\@aminoq;
$ref_pk=\@pk;
$ref_charge=\@charge;

# read_file.txt: a first line built from $header_file as it stands at this
# point, followed by a fixed description line.
chdir($dir_name) or die "Cannot enter $dir_name: $!";
open( READ, '>', 'read_file.txt' )
  or die "Cannot create read_file.txt in $dir_name: $!";
$header_file=$header_file."\n";
print READ $header_file;
$header_file="Peptide database created by create_database\.pl as input only for phopshopeptide_ID\.pl\n";
print READ $header_file;
close(READ);


#PTM and MC combination
#@keys_all_N contains the keys of the hash for the N file
#the key is "mc" and then a set of bit (1 if the ptm is present, 0 else)
$combination = 2**$ptm_more;
$bit         = sprintf( "%b", ( $combination - 1 ) );    #the biggest number
$l_bit       = length($bit);
$q           = 0;

# One key per (PTM bit pattern, number of missed cleavages): the missed
# cleavage count followed by the pattern, left-padded with zeros to the
# width of the largest pattern.
for ( $j = 0 ; $j < $combination ; $j++ ) {
	for ( $i = 0 ; $i < ( $n_mc + 1 ) ; $i++ ) {
		$bit = "";
		$bit = sprintf( "%b", $j );
		if ( length($bit) < $l_bit ) {
			$diff_l = $l_bit - length($bit);
			for ( $k = 0 ; $k < $diff_l ; $k++ ) {
				$bit = "0" . $bit;
			}
		}
		$keys_all_N[$q] = $i . $bit;
		$q++;
	}
}
@keys_all_N = sort { $a <=> $b } @keys_all_N;
##range file: first line lists all keys, tab separated
chdir($dir_name) or die "Cannot enter $dir_name: $!";
open( NFILE, '>', "range_" . $low_b . "_" . $upp_b . ".txt" )
  or die "Cannot create range_${low_b}_${upp_b}.txt in $dir_name: $!";
for ( $q = 0 ; $q < @keys_all_N ; $q++ ) {
	print NFILE "$keys_all_N[$q]\t";
}
print NFILE "\n";

# From here on $ptm_more is the index of the last PTM row, not the count.
if ( $ptm_more > 0 ) {
	$ptm_more = $ptm_more - 1;    #starts from 0
}

# Compressed protein list, written in the db directory.
chdir($dir_db) or die "Cannot enter $dir_db: $!";
$PROT_FILE = IO::Zlib->new;
$PROT_FILE->open( "$protein_file", "wb9" )
  or die "Cannot create $protein_file in $dir_db: $!";

# FASTA input, read from the tmp directory. Surrounding blanks are removed
# explicitly because the three-argument open does not trim the name.
chdir($dir_tmp) or die "Cannot enter $dir_tmp: $!";
( my $fasta_path = $file_name ) =~ s/^\s+|\s+$//g;
open( DB_IN, '<', $fasta_path )
  or die "Cannot open FASTA database '$fasta_path' in $dir_tmp: $!";
$count_prot = -1;
$prot_proc  = 0;
$flag       = 0;
$e          = 0;
$r          = 0;
$flag_read  = 0;
$decoy      = 0;

while ( !eof DB_IN ) {
	$row = <DB_IN>;

	# Remove the line terminator, whatever it is. Cutting a fixed two
	# characters is only right for CRLF input: with LF-terminated files, or
	# on a final line without terminator, it also removed sequence residues.
	$row =~ s/[\r\n]+\z//;
	
	
	if ( ( $prot_proc % 100000 ) == 0 ) {
		print "Rows processed: $prot_proc\n";
	}
	$prot_proc++;

	#header
	if ( $row eq "" ) {
		next;
	}
	if ( $row =~ /^>/ ) {
		
		if ( $flag == 1 && $org eq $organism ) {
			#buffer
			
			# Flush the peptide buffer to the mass-bin files whenever the
			# number of accepted proteins is a positive multiple of 2000.
			if ( $count_prot % 2000 == 0 && $count_prot > 0 ) {
				# empty @sorted
				@sorted = "";
				pop(@sorted);

				# buffer indices ordered by ascending peptide mass
				@sorted =
				  sort { $hash_array{$a} <=> $hash_array{$b} } keys %hash_array;
				@keys_hash_array = keys(%hash_array);
				chdir($dir_name);
				for ( $i = 0 ; $i < @sorted ; $i++ ) {
					# one key is removed per iteration, so %hash_array is
					# empty when the loop ends
					delete( $hash_array{ $keys_hash_array[$i] } );
					$row2       = $array_peptides[ $sorted[$i] ];
					@split_row2 = split( "\t", $row2 );

					# field 2 of a buffered row is the peptide mass
					$pep        = $split_row2[2];
					$hash_mass{$pep}++;

					# The file name is the upper bound of the mass bin; the
					# bin width depends on the mass range.
					$file = "";
					if ( $pep <= 9000 ) {
						if ( $pep <= 500 ) {
							$bin = 100;
							if ( $pep == 500 ) {
								$file = "500";
							}
						}
						elsif ( $pep <= 900 ) {
							$bin = 1;
							if ( $pep == 900 ) {
								$file = "900";
							}
						}
						elsif ( $pep <= 3000 ) {
							$bin = 0.5;
							if ( $pep == 3000 ) {
								$file = "3000";
							}
							else {
								# half-unit bins: ".5" becomes "_5", whole
								# bounds get a "_0" suffix
								$range =
								  $bin + ( $bin * ( int( $pep / $bin ) ) );
								$num = ( $range =~ s/\.5/\_5/ );
								if ( $num == 1 ) {
									$file = $range;
								}
								else {
									$file = $range . "_0";
								}
							}
						}
						elsif ( $pep <= 5000 ) {
							$bin = 1;
							if ( $pep == 5000 ) {
								$file = "5000";
							}
						}
						elsif ( $pep <= 8000 ) {
							$bin = 10;
							if ( $pep == 8000 ) {
								$file = "8000";
							}
						}
						elsif ( $pep <= 9000 ) {
							$bin = 100;
							if ( $pep == 9000 ) {
								$file = "9000";
							}
						}

						# default name when no branch above chose one
						$range = $bin + ( $bin * ( int( $pep / $bin ) ) );
						if ( $file eq "" ) {
							$file = $range;
						}
					}
					else {
						if ( $pep <= 10000 ) {
							$file = "10000";
						}
						elsif ( $pep <= 20000 ) {
							$file = "20000";
						}
						else {
							$file = "big_pep";
						}
					}

					# append the row to its bin file (open is not checked)
					open( FILE, ">>$file" );
					print FILE $row2;
					close(FILE);
				}

				# empty the buffer and restart its index
				@array_peptides = "";
				pop(@array_peptides);
				$e = 0;
				chdir($dir_src);
			}
            
			#ambiguous amino acids J, B, Z, X
			#J is replaced with L
			$protein =~ s/J/L/g;

			#a protein containing B, Z or X is discarded: jump straight to
			#the parsing of the header just read
			if ( $protein =~ /[XBZ]/ ) {
				goto "JUMP_PROT";
			}
			$count_prot++;

			# the reversed sequence is kept for the decoy database
			$decoy_prot[$decoy] = reverse($protein);
			$decoy++;
			$n_aa = length($protein);    #number of amino acids of this protein
			                             #computes mw
			                             #print "$accession $n_aa\n$protein\n";
			$MW   = 0;

			# protein mass from the average masses (flag 0)
			$MW = &molecular_weight( $protein, $ref_aa, $ref_mw_av, 0 );
			$PI=0;

			# isoelectric point, formatted with two decimals
    		$PI=&isoel_point($protein,$ref_aminoq,$ref_pk,$ref_charge);
    		$PI=sprintf "%.2f", $PI;
			#enzymatic digestion
			# A cleavage site is a pair of residues: the bond after the first
			# one is cut (after F/L/Y unless followed by P, after W unless
			# followed by M/P, after M unless followed by P/Y, after H unless
			# followed by D/M/P/W).
			#
			# The loop used to rebuild $string from the whole two-residue
			# site, so the same site matched again on every pass and the
			# loop never ended; the greedy prefix also selected the last
			# site instead of the first. Now the prefix is non-greedy and
			# only the residue after the cut is carried over.
			$m       = 0;
			$pos     = -1;    #index of the last residue already assigned
			$stop    = -1;
			$first   = substr( $protein, 0, 2 );    #first 2 aa of the protein
			$l_prot  = $n_aa;
			$string  = $protein;                    #part still to be digested
			$start   = 0;
			$flag_aa = 0;    #no longer used by the loop; kept initialised

			while ( $string =~
				/(^\w*?)([FLY][^P]|W[^MP]|M[^PY]|H[^DMPW])(\w*$)/ )
			{
				my ( $before, $site, $after ) = ( $1, $2, $3 );

				#the peptide ends with the first residue of the site
				$peptide = $before . substr( $site, 0, 1 );

				#peptide position in the protein (0-based, inclusive)
				$pos += length($peptide);
				$old_stop = $stop;
				$stop     = $pos;
				$start    = $old_stop + 1;

				#new string: starts with the residue after the cut
				$string = substr( $site, 1 ) . $after;
				$m++;

				#writes db
				$mw_pep = molecular_weight( $peptide, $ref_aa, $ref_mw_mn, 1 );
				$id = $m;

				# the mass is rounded here only when no later step (missed
				# cleavages, PTMs) will modify it
				if ( $n_mc == 0 && $flag_ptm eq "N" ) {
					$mw_pep = sprintf "%.4f", $mw_pep;
				}
				$hash_prot{$m} = "$id\t$mw_pep\t$peptide\t$start\t$stop";
			}

			#last peptide of the protein: everything after the last cut
			$shift = $pos + 1;
			$m++;
			$peptide = substr( $protein, $shift, $l_prot - $pos - 1 );
			$start   = $stop + 1;
			$stop    = $l_prot - 1;
			$mw_pep  = molecular_weight( $peptide, $ref_aa, $ref_mw_mn, 1 );
			$id      = $m;
			if ( $n_mc == 0 && $flag_ptm eq "N" ) {
				$mw_pep = sprintf "%.4f", $mw_pep;
			}
			$hash_prot{$m} = "$id\t$mw_pep\t$peptide\t$start\t$stop";

			#mc: add the peptides obtained by joining up to $n_mc
			#consecutive neighbours to each digested peptide
			if ( $n_mc > 0 ) {
				@keys = keys(%hash_prot);
				@keys_sort = sort { $a <=> $b } @keys;
				for ( $i = 0 ; $i < @keys_sort ; $i++ ) {
					$z = 0;

					# collect the peptide at $i and its next $n_mc
					# neighbours; nothing is collected for the last peptide
					for ( $z = 0 ; $z <= $n_mc ; $z++ ) {  
						$key_exist = exists( $keys_sort[ $i + $z ] );
						if ( ( @keys_sort - $i ) > 1 && $key_exist == 1 ) {
							$array_mc[$z] = $hash_prot{ $keys_sort[ $i + $z ] };
						}

						#$z++;
					}

					# extend the running peptide one neighbour at a time;
					# every extension is stored under "<id>_<neighbour id>"
					$start_mc = $array_mc[0];
					for ( $z = 1 ; $z <= $n_mc ; $z++ ) {    
						if ( $array_mc[$z] ne "" ) {
							@array_prot = split( "\t", $start_mc );
							$pep_id1    = $array_prot[0];
							$pep_mw1    = $array_prot[1];
							$peptide1   = $array_prot[2];
							$pep_start1 = $array_prot[3];

							$pep_mc     = $array_mc[$z];
							@array_prot = split( "\t", $pep_mc );
							$pep_id2    = $array_prot[0];
							$pep_mw2    = $array_prot[1];
							$peptide2   = $array_prot[2];
							$pep_stop2  = $array_prot[4];

							$new_id  = $pep_id1 . "_" . $pep_id2;
							$new_pep = $peptide1 . $peptide2;

							# both masses include one water; remove one
							$new_mw  = $pep_mw1 + $pep_mw2 - 18.01056;
							if ( $flag_ptm eq "N" ) {
								$new_mw = sprintf "%.4f", $new_mw;
								$hash_prot{$new_id} =
"$new_id\t$new_mw\t$new_pep\t$pep_start1\t$pep_stop2";
							}
							else {
								$hash_prot{$new_id} =
"$new_id\t$new_mw\t$new_pep\t$pep_start1\t$pep_stop2";
							}
							$start_mc = $hash_prot{$new_id};
						}
					}

					# empty @array_mc
					@array_mc = "";
					pop(@array_mc);
				}
			}

			#PTM: for every variable PTM, replace each peptide entry with one
			#entry per number of modified residues (0 up to 4); the id gets
			#the suffix "_<ptm name><count>"
			if ( $flag_ptm eq "Y" ) {
				$j = 0;
				while ( $j < $ptm_more + 1 ) {
					@keys      = keys(%hash_prot);
					@keys_sort = sort { $a <=> $b } @keys;
					$num_ptm   = $flag_maxptm[$j];
					$aa_mod    = $ptm_aa[$j];
					$name      = $ptm_name[$j];
					
					# fixed PTMs (name carrying "fix") are not expanded
					if($name=~/fix/){
						$j++;
						next; 
					}
					for ( $i = 0 ; $i < @keys_sort ; $i++ ) {
						$row_ptm      = $hash_prot{ $keys_sort[$i] };
						@array_row    = split( "\t", $row_ptm );
						$pep_id       = $array_row[0];
						$pep_mw       = $array_row[1];
						$peptide      = $array_row[2];
						$pep_start    = $array_row[3];
						$pep_stop     = $array_row[4];
						$peptide_temp = $peptide;

						# count the modifiable residues; the substitution
						# runs on a copy and only its count is used
						if ( length($aa_mod) > 1 ) {
							$number_aa =
							  ( $peptide_temp =~ s/[$aa_mod]/[$aa_mod]/g );
						}
						else {
							$number_aa =
							  ( $peptide_temp =~ s/$aa_mod/$aa_mod/g );
						}
						if ( $number_aa eq "" ) {
							$number_aa = 0;
						}
						###############
						# the value read from @flag_maxptm is overridden:
						# at most 4 modified residues per peptide
						$num_ptm = $number_aa;    ################
						if ( $num_ptm > 4 ) {
							$num_ptm = 4;
						}
						################
						if ( $num_ptm != 0 ) {
							# unmodified form, suffix "0"
							delete( $hash_prot{$pep_id} );
							$new_id = $pep_id . "_" . $name . "0";

							# masses are rounded to 4 decimals only when $j
							# is the last PTM index
							if ( $j == $ptm_more ) {
								$pep_mw = sprintf "%.4f", $pep_mw;
								$hash_prot{$new_id} =
"$new_id\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
							}
							else {
								$hash_prot{$new_id} =
"$new_id\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
							}
							if ( $number_aa > 0 ) {
								$new_id = $pep_id . "_" . $name . "1";
								$new_mw =
								  $pep_mw +
								  $ptm_weight_mn[$j];    # case 1 aa modified
								if ( $j == $ptm_more ) {
									$new_mw = sprintf "%.4f", $new_mw;
									$hash_prot{$new_id} =
"$new_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
								}
								else {
									$hash_prot{$new_id} =
"$new_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
								}
								if ( $number_aa > 1 ) {
									$count_ptm = 2;
									while ($count_ptm <= $num_ptm
										&& $count_ptm <= $number_aa )
									{
										$new_id =
										  $pep_id . "_" . $name . $count_ptm;
										$new_mw = $pep_mw +
										  ( $count_ptm * $ptm_weight_mn[$j] )
										  ;    # case 2 and more aa modified
										if ( $j == $ptm_more ) {
											$new_mw = sprintf "%.4f", $new_mw;
											$hash_prot{$new_id} =
"$new_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
										}
										else {
											$hash_prot{$new_id} =
"$new_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
										}
										$count_ptm++;
									}    #while
								}    #if($number_aa>1)
							}    #if($number_aa>0)
						}    #if($num_ptm!=0)
						else {
							# no modifiable residue: only the "0" form
							delete( $hash_prot{$pep_id} );
							if ( $number_aa == 0 ) {
								$pep_id2 = $pep_id . "_" . $name . "0";
								if ( $j == $ptm_more ) {
									$new_mw = sprintf "%.4f", $pep_mw;
									$hash_prot{$pep_id2} =
"$pep_id2\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
								}
								else {
									$hash_prot{$pep_id2} =
"$pep_id2\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
								}
							}
							else {
								# NOTE(review): $num_ptm is zero only when
								# $number_aa is zero, so this branch looks
								# unreachable - confirm.

								#$number_aa>0
								$pep_id2 = $pep_id . "_" . $name . "*";
								$pep_id  = $pep_id . "_" . $name . $number_aa;
								$new_mw =
								  $pep_mw + ( $number_aa * $ptm_weight_mn[$j] );
								if ( $j == $ptm_more ) {
									$new_mw = sprintf "%.4f", $new_mw;
									$hash_prot{$pep_id} =
"$pep_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
									$hash_prot{$pep_id2} =
"$pep_id2\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
								}
								else {
									$hash_prot{$pep_id} =
"$pep_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
									$hash_prot{$pep_id2} =
"$pep_id2\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
								}

							}
						}    #else
					}    #for($i=0; $i<@keys_sort; $i++)
					$j++;
				}    # while
			}    #if flag_ptm
			$MW        = sprintf "%.2f", $MW;
			@keys      = keys(%hash_prot);
			@keys_sort = sort { $a <=> $b } @keys;
			$tot_pep   = @keys_sort;

			#new file: one header line per protein in the compressed list
			$header =
			"$accession\t$iden\t$organism\t$description\t$MW\t$PI\t$n_aa\n";
			print $PROT_FILE $header;

			# Move every peptide of this protein into the buffer and count
			# it in the per-protein table of (missed cleavages, PTM bits).
			for ( $s = 0 ; $s < @keys_sort ; $s++ ) {
				$string_prot    = $hash_prot{ $keys_sort[$s] };
				@split_prot     = split( "\t", $string_prot );
				
				# buffer: mass by index, and the full row prefixed with the
				# protein counter
				$hash_array{$e} = $split_prot[1];
				$array_peptides[$e] =
				  $count_prot . "\t" . $hash_prot{ $keys_sort[$s] } . "\n";
				$mc               = 0;
				$keys_hash_N_temp = "";
				$start_id         = 1;
				@split_id         = split( "_", $keys_sort[$s] );
				$flag_star        = 0;
				$n_split          = @split_id;

				if ( $n_split == 1 ) {
					$mc               = 0;
					$keys_hash_N_temp = "00";
				}
				else {
					# the index of the last piece of the id that starts
					# with a digit is taken as the missed-cleavage count
					for ( $d = 0 ; $d < $n_split ; $d++ ) {
						if ( $split_id[$d] =~ /^\d+/ ) {
							$mc = $d;
						}
					}
					$keys_hash_N_temp = $mc;
					if ( $flag_ptm eq "Y" ) {
						# remaining pieces are "<ptm name><count or *>":
						# append 1 when the count is positive, 0 otherwise;
						# a "*" additionally raises $flag_star
						$start_id = $mc + 1;
						for ( $d = $start_id ; $d < @split_id ; $d++ ) {
							$split_id[$d] =~ /([a-zA-Z]+)([0-9]+|\*{1})/;
							$mod   = $1;
							$n_mod = $2;
							if ( $n_mod == 0 && $n_mod ne "\*" ) {
								$keys_hash_N_temp = $keys_hash_N_temp . "0";
							}
							elsif ( $n_mod > 0 ) {
								$keys_hash_N_temp = $keys_hash_N_temp . "1";
							}
							else {
								$keys_hash_N_temp = $keys_hash_N_temp . "0";
								$flag_star        = 1;
							}
						}
					}    #if($flag_ptm eq "Y")
					else {
						$keys_hash_N_temp = $keys_hash_N_temp . "0";
					}
				}    #else

				# only peptides inside the interpretation range are counted
				if (   $split_prot[1] >= $low_b
					&& $split_prot[1] <= $upp_b
					&& $flag_star == 0 )
				{
					$hash_tableN{$keys_hash_N_temp} =
					  $hash_tableN{$keys_hash_N_temp} + 1;
				}
				$hash_count_pep{ $split_prot[1] } =
				  $hash_count_pep{ $split_prot[1] } + 1;
				$e++;
				delete $hash_prot{ $keys_sort[$s] };
			}    #for $s

			# one line of the range file per protein: the counts in the
			# order of @keys_all_N (0 when absent); the table is emptied
			for ( $q = 0 ; $q < @keys_all_N ; $q++ ) {
				
				if ( $hash_tableN{ $keys_all_N[$q] } eq "" ) {
					$hash_tableN{ $keys_all_N[$q] } = 0;
				}
				print NFILE $hash_tableN{ $keys_all_N[$q] } . "\t";
				delete( $hash_tableN{ $keys_all_N[$q] } );
			}
			print NFILE "\n";
		}    #if $org eq $organism
	  JUMP_PROT:
		# Parse the header just read, e.g.
		#>sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3
		#
		# The organism is the text after "OS=" up to the next " XX=" tag
		# (OX=, GN=, PE=, SV=) or the end of the line. The previous pattern
		# used the character class [^GN=|PE=], which stopped at any G, N, P,
		# E, "=" or "|": it truncated organism names containing those
		# capitals and returned "<organism> OX" for headers with an OX= tag.
		( $accession, $iden, $description, $org ) = ();
		if ( $row =~
			/^>\w{2}\|(\w+-?\d*)\|(.\S+)\s(.+)OS=(.+?)(?=\s+[A-Z]{2}=|\s*$)/ )
		{
			$accession   = $1;
			$iden        = $2;
			$description = $3;
			$org         = $4;

			#remove trailing blanks
			$org =~ s/\s+$//;
		}
   		#print "$accession ID $iden DESCR $description ORG $org\n";

		$protein = "";    #erase protein string
	}
	else {
		
		$protein=$protein.$row;
		
		$flag = 1;
	}
}

#last protein

if ( $flag == 1 && $org eq $organism ) {
	#$count_prot++;
	#ambiguous amino acids J, B, Z, X
	#J is replaced with L
	$protein =~ s/J/L/g;

	#a protein containing B, Z or X is discarded: skip the whole block
	if ( $protein =~ /[XBZ]/ ) {
		goto "JUMP_PROT2";
	}
	$count_prot++;

	# the reversed sequence is kept for the decoy database
	$decoy_prot[$decoy] = reverse($protein);
	$decoy++;
	$n_aa = length($protein);    #number of amino acids of this protein
	                             #computes mw
	                             #print "$accession $n_aa\n$protein\n";

	$MW = 0;

	# protein mass from the average masses (flag 0)
	$MW = &molecular_weight( $protein, $ref_aa, $ref_mw_av, 0 );
	$PI=0;

	# isoelectric point, formatted with two decimals
    $PI=&isoel_point($protein,$ref_aminoq,$ref_pk,$ref_charge);
    $PI=sprintf "%.2f", $PI;
			
	#enzymatic digestion
	# A cleavage site is a pair of residues: the bond after the first one is
	# cut (after F/L/Y unless followed by P, after W unless followed by M/P,
	# after M unless followed by P/Y, after H unless followed by D/M/P/W).
	#
	# The loop used to rebuild $string from the whole two-residue site, so
	# the same site matched again on every pass and the loop never ended;
	# the greedy prefix also selected the last site instead of the first.
	# Now the prefix is non-greedy and only the residue after the cut is
	# carried over.
	$m       = 0;
	$pos     = -1;    #index of the last residue already assigned
	$stop    = -1;
	$first   = substr( $protein, 0, 2 );    #first 2 aa of the protein
	$l_prot  = $n_aa;
	$string  = $protein;                    #part still to be digested
	$start   = 0;
	$flag_aa = 0;    #no longer used by the loop; kept initialised

	while ( $string =~
		/(^\w*?)([FLY][^P]|W[^MP]|M[^PY]|H[^DMPW])(\w*$)/ )
	{
		my ( $before, $site, $after ) = ( $1, $2, $3 );

		#the peptide ends with the first residue of the site
		$peptide = $before . substr( $site, 0, 1 );

		#peptide position in the protein (0-based, inclusive)
		$pos += length($peptide);
		$old_stop = $stop;
		$stop     = $pos;
		$start    = $old_stop + 1;

		#new string: starts with the residue after the cut
		$string = substr( $site, 1 ) . $after;
		$m++;

		#writes db
		$mw_pep = molecular_weight( $peptide, $ref_aa, $ref_mw_mn, 1 );
		$id = $m;

		# the mass is rounded here only when no later step (missed
		# cleavages, PTMs) will modify it
		if ( $n_mc == 0 && $flag_ptm eq "N" ) {
			$mw_pep = sprintf "%.4f", $mw_pep;
		}
		$hash_prot{$m} = "$id\t$mw_pep\t$peptide\t$start\t$stop";
	}

	#last peptide of the protein: everything after the last cut
	$shift = $pos + 1;
	$m++;
	$peptide = substr( $protein, $shift, $l_prot - $pos - 1 );
	$start   = $stop + 1;
	$stop    = $l_prot - 1;
	$mw_pep  = molecular_weight( $peptide, $ref_aa, $ref_mw_mn, 1 );
	$id      = $m;
	if ( $n_mc == 0 && $flag_ptm eq "N" ) {
		$mw_pep = sprintf "%.4f", $mw_pep;
	}
	$hash_prot{$m} = "$id\t$mw_pep\t$peptide\t$start\t$stop";

	#mc: add the peptides obtained by joining up to $n_mc consecutive
	#neighbours to each digested peptide
	if ( $n_mc > 0 ) {
		@keys = keys(%hash_prot);
		@keys_sort = sort { $a <=> $b } @keys;
		for ( $i = 0 ; $i < @keys_sort ; $i++ ) {
			$z = 0;

			# collect the peptide at $i and its next $n_mc neighbours;
			# nothing is collected for the last peptide
			for ( $z = 0 ; $z <= $n_mc ; $z++ ) {    
				$key_exist = exists( $keys_sort[ $i + $z ] );
				if ( ( @keys_sort - $i ) > 1 && $key_exist == 1 ) {
					$array_mc[$z] = $hash_prot{ $keys_sort[ $i + $z ] };
				}

				#$z++;
			}

			# extend the running peptide one neighbour at a time; every
			# extension is stored under "<id>_<neighbour id>"
			$start_mc = $array_mc[0];
			for ( $z = 1 ; $z <= $n_mc ; $z++ ) {    
				if ( $array_mc[$z] ne "" ) {
					@array_prot = split( "\t", $start_mc );
					$pep_id1    = $array_prot[0];
					$pep_mw1    = $array_prot[1];
					$peptide1   = $array_prot[2];
					$pep_start1 = $array_prot[3];

					$pep_mc     = $array_mc[$z];
					@array_prot = split( "\t", $pep_mc );
					$pep_id2    = $array_prot[0];
					$pep_mw2    = $array_prot[1];
					$peptide2   = $array_prot[2];
					$pep_stop2  = $array_prot[4];

					$new_id  = $pep_id1 . "_" . $pep_id2;
					$new_pep = $peptide1 . $peptide2;

					# both masses include one water; remove one
					$new_mw  = $pep_mw1 + $pep_mw2 - 18.01056;
					if ( $flag_ptm eq "N" ) {
						$new_mw = sprintf "%.4f", $new_mw;
						$hash_prot{$new_id} =
						  "$new_id\t$new_mw\t$new_pep\t$pep_start1\t$pep_stop2";
					}
					else {
						$hash_prot{$new_id} =
						  "$new_id\t$new_mw\t$new_pep\t$pep_start1\t$pep_stop2";
					}
					$start_mc = $hash_prot{$new_id};
				}
			}

			# empty @array_mc
			@array_mc = "";
			pop(@array_mc);
		} #for i
	} #n_mc

	#PTM: for every variable PTM, replace each peptide entry with one entry
	#per number of modified residues (0 up to 4); the id gets the suffix
	#"_<ptm name><count>"
	if ( $flag_ptm eq "Y" ) {
		$j = 0;
		while ( $j < $ptm_more + 1 ) {
			@keys      = keys(%hash_prot);
			@keys_sort = sort { $a <=> $b } @keys;
			$num_ptm   = $flag_maxptm[$j];
			$aa_mod    = $ptm_aa[$j];
			$name      = $ptm_name[$j];

			# fixed PTMs (name carrying "fix") are not expanded
			if($name=~/fix/){
				$j++;
				next; 
			}
			for ( $i = 0 ; $i < @keys_sort ; $i++ ) {
				$row_ptm      = $hash_prot{ $keys_sort[$i] };
				@array_row    = split( "\t", $row_ptm );
				$pep_id       = $array_row[0];
				$pep_mw       = $array_row[1];
				$peptide      = $array_row[2];
				$pep_start    = $array_row[3];
				$pep_stop     = $array_row[4];
				$peptide_temp = $peptide;

				# count the modifiable residues; the substitution runs on a
				# copy and only its count is used
				if ( length($aa_mod) > 1 ) {
					$number_aa = ( $peptide_temp =~ s/[$aa_mod]/[$aa_mod]/g );
				}
				else {
					$number_aa = ( $peptide_temp =~ s/$aa_mod/$aa_mod/g );
				}
				if ( $number_aa eq "" ) {
					$number_aa = 0;
				}
				###############
				# the value read from @flag_maxptm is overridden: at most 4
				# modified residues per peptide
				$num_ptm = $number_aa;    ################
				if ( $num_ptm > 4 ) {
					$num_ptm = 4;
				}
				################
				if ( $num_ptm != 0 ) {
					# unmodified form, suffix "0"
					delete( $hash_prot{$pep_id} );
					$new_id = $pep_id . "_" . $name . "0";

					# masses are rounded to 4 decimals only when $j is the
					# last PTM index
					if ( $j == $ptm_more ) {
						$pep_mw = sprintf "%.4f", $pep_mw;
						$hash_prot{$new_id} =
						  "$new_id\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
					}
					else {
						$hash_prot{$new_id} =
						  "$new_id\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
					}
					if ( $number_aa > 0 ) {
						$new_id = $pep_id . "_" . $name . "1";
						$new_mw =
						  $pep_mw + $ptm_weight_mn[$j];    # case 1 aa modified
						if ( $j == $ptm_more ) {
							$new_mw = sprintf "%.4f", $new_mw;
							$hash_prot{$new_id} =
"$new_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
						}
						else {
							$hash_prot{$new_id} =
"$new_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
						}
						if ( $number_aa > 1 ) {
							$count_ptm = 2;
							while ($count_ptm <= $num_ptm
								&& $count_ptm <= $number_aa )
							{
								$new_id = $pep_id . "_" . $name . $count_ptm;
								$new_mw = $pep_mw +
								  ( $count_ptm * $ptm_weight_mn[$j] )
								  ;    # case 2 and more aa modified
								if ( $j == $ptm_more ) {
									$new_mw = sprintf "%.4f", $new_mw;
									$hash_prot{$new_id} =
"$new_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
								}
								else {
									$hash_prot{$new_id} =
"$new_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
								}
								$count_ptm++;
							}    #while
						}    #if($number_aa>1)
					}    #if($number_aa>0)
				}    #if($num_ptm!=0)
				else {
					# no modifiable residue: only the "0" form
					delete( $hash_prot{$pep_id} );
					if ( $number_aa == 0 ) {
						$pep_id2 = $pep_id . "_" . $name . "0";
						if ( $j == $ptm_more ) {
							$new_mw = sprintf "%.4f", $pep_mw;
							$hash_prot{$pep_id2} =
"$pep_id2\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
						}
						else {
							$hash_prot{$pep_id2} =
"$pep_id2\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
						}
					}
					else {
						# NOTE(review): $num_ptm is zero only when
						# $number_aa is zero, so this branch looks
						# unreachable - confirm.

						#$number_aa>0
						$pep_id2 = $pep_id . "_" . $name . "*";
						$pep_id  = $pep_id . "_" . $name . $number_aa;
						$new_mw = $pep_mw + ( $number_aa * $ptm_weight_mn[$j] );
						if ( $j == $ptm_more ) {
							$new_mw = sprintf "%.4f", $new_mw;
							$hash_prot{$pep_id} =
"$pep_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
							$hash_prot{$pep_id2} =
"$pep_id2\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
						}
						else {
							$hash_prot{$pep_id} =
"$pep_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
							$hash_prot{$pep_id2} =
"$pep_id2\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";
						}

					}
				}    #else
			}    #for($i=0; $i<@keys_sort; $i++)
			$j++;
		}    #while
	}    #if flag_ptm
	$MW        = sprintf "%.2f", $MW;
	@keys      = keys(%hash_prot);
	@keys_sort = sort { $a <=> $b } @keys;
	$tot_pep   = @keys_sort;

	#new file: one header line per protein in the compressed list
	$header = "$accession\t$iden\t$organism\t$description\t$MW\t$PI\t$n_aa\n";
	print $PROT_FILE $header;

	# Move every peptide of this protein into the buffer and count it in the
	# per-protein table of (missed cleavages, PTM bits).
	for ( $s = 0 ; $s < @keys_sort ; $s++ ) {
		
		$string_prot    = $hash_prot{ $keys_sort[$s] };
		@split_prot     = split( "\t", $string_prot );

		# buffer: mass by index, and the full row prefixed with the protein
		# counter
		$hash_array{$e} = $split_prot[1];
		$array_peptides[$e] =
		  $count_prot . "\t" . $hash_prot{ $keys_sort[$s] } . "\n";
		$mc               = 0;
		$keys_hash_N_temp = "";
		$start_id         = 1;
		@split_id         = split( "_", $keys_sort[$s] );
		$flag_star        = 0;
		$n_split          = @split_id;

		if ( $n_split == 1 ) {
			$mc               = 0;
			$keys_hash_N_temp = "00";
		}
		else {
			# the index of the last piece of the id that starts with a
			# digit is taken as the missed-cleavage count
			for ( $d = 0 ; $d < $n_split ; $d++ ) {
				if ( $split_id[$d] =~ /^\d+/ ) {
					$mc = $d;
				}
			}
			$keys_hash_N_temp = $mc;
			if ( $flag_ptm eq "Y" ) {
				# remaining pieces are "<ptm name><count or *>": append 1
				# when the count is positive, 0 otherwise; a "*"
				# additionally raises $flag_star
				$start_id = $mc + 1;
				for ( $d = $start_id ; $d < @split_id ; $d++ ) {
					$split_id[$d] =~ /([a-zA-Z]+)([0-9]+|\*{1})/;
					$mod   = $1;
					$n_mod = $2;
					if ( $n_mod == 0 && $n_mod ne "\*" ) {
						$keys_hash_N_temp = $keys_hash_N_temp . "0";
					}
					elsif ( $n_mod > 0 ) {
						$keys_hash_N_temp = $keys_hash_N_temp . "1";
					}
					else {
						$keys_hash_N_temp = $keys_hash_N_temp . "0";
						$flag_star        = 1;
					}
				}
			}    #if($flag_ptm eq "Y")
			else {
				$keys_hash_N_temp = $keys_hash_N_temp . "0";
			}
		}    #else

		# only peptides inside the interpretation range are counted
		if (   $split_prot[1] >= $low_b
			&& $split_prot[1] <= $upp_b
			&& $flag_star == 0 )
		{
			$hash_tableN{$keys_hash_N_temp} =
			  $hash_tableN{$keys_hash_N_temp} + 1;
		}
		$hash_count_pep{ $split_prot[1] } =
		  $hash_count_pep{ $split_prot[1] } + 1;
		$e++;
		delete $hash_prot{ $keys_sort[$s] };
	}    #for $s

	# one line of the range file per protein: the counts in the order of
	# @keys_all_N (0 when absent); the table is emptied
	for ( $q = 0 ; $q < @keys_all_N ; $q++ ) {
		
		if ( $hash_tableN{ $keys_all_N[$q] } eq "" ) {
			$hash_tableN{ $keys_all_N[$q] } = 0;
		}
		print NFILE $hash_tableN{ $keys_all_N[$q] } . "\t";
		delete( $hash_tableN{ $keys_all_N[$q] } );
	}
	print NFILE "\n";
}    #if $org eq $organism

JUMP_PROT2:
close(DB_IN);

#buffer
# Final flush of the peptide buffer to the mass-bin files.
# empty @sorted
@sorted = "";
pop(@sorted);

# buffer indices ordered by ascending peptide mass
@sorted = sort { $hash_array{$a} <=> $hash_array{$b} } keys %hash_array;
@keys_hash_array = keys(%hash_array);
chdir($dir_name);
for ( $i = 0 ; $i < @sorted ; $i++ ) {
	# one key is removed per iteration, so %hash_array is empty when the
	# loop ends
	delete( $hash_array{ $keys_hash_array[$i] } );
	$row2       = $array_peptides[ $sorted[$i] ];
	@split_row2 = split( "\t", $row2 );

	# field 2 of a buffered row is the peptide mass
	$pep        = $split_row2[2];
	$hash_mass{$pep}++;

	# The file name is the upper bound of the mass bin; the bin width
	# depends on the mass range.
	$file = "";
	if ( $pep <= 9000 ) {
		if ( $pep <= 500 ) {
			$bin = 100;
			if ( $pep == 500 ) {
				$file = "500";
			}
		}
		elsif ( $pep <= 900 ) {
			$bin = 1;
			if ( $pep == 900 ) {
				$file = "900";
			}
		}
		elsif ( $pep <= 3000 ) {
			$bin = 0.5;
			if ( $pep == 3000 ) {
				$file = "3000";
			}
			else {
				# half-unit bins: ".5" becomes "_5", whole bounds get a
				# "_0" suffix
				$range = $bin + ( $bin * ( int( $pep / $bin ) ) );
				$num = ( $range =~ s/\.5/\_5/ );
				if ( $num == 1 ) {
					$file = $range;
				}
				else {
					$file = $range . "_0";
				}
			}
		}
		elsif ( $pep <= 5000 ) {
			$bin = 1;
			if ( $pep == 5000 ) {
				$file = "5000";
			}
		}
		elsif ( $pep <= 8000 ) {
			$bin = 10;
			if ( $pep == 8000 ) {
				$file = "8000";
			}
		}
		elsif ( $pep <= 9000 ) {
			$bin = 100;
			if ( $pep == 9000 ) {
				$file = "9000";
			}
		}

		# default name when no branch above chose one
		$range = $bin + ( $bin * ( int( $pep / $bin ) ) );
		if ( $file eq "" ) {
			$file = $range;
		}
	}
	else {
		if ( $pep <= 10000 ) {
			$file = "10000";
		}
		elsif ( $pep <= 20000 ) {
			$file = "20000";
		}
		else {
			$file = "big_pep";
		}
	}

	# append the row to its bin file (open is not checked)
	open( FILE, ">>$file" );
	print FILE $row2;
	close(FILE);
}

# empty the buffer and restart its index
@array_peptides = "";
pop(@array_peptides);
$e = 0;

# The decoy counter starts at -1 because the loop below increments it
# before building the "Decoy_<n>" label, so the first decoy protein is
# labelled Decoy_0.
$count_decoy = -1;

#decoy (random) database
print "RANDOM\n";

for ( $bbb = 0 ; $bbb < @decoy_prot ; $bbb++ ) {
	$protein = $decoy_prot[$bbb];

	#if($row=~/^>/ ){
	# Whenever the decoy counter is a multiple of 100, flush the buffered
	# decoy peptide rows to the mass-bin files in $dir_name, then return to
	# $dir_src.  %hash_array maps a buffer index to the peptide mass and
	# @array_peptides holds the row for that index; rows are written in
	# ascending mass order.  The file is named after the upper edge of the
	# mass bin ($bin + $bin * int($pep / $bin)); bin widths are 100 up to
	# 500, 1 up to 900, 0.5 up to 3000, 1 up to 5000, 10 up to 8000 and 100
	# up to 9000.  A mass equal to one of those limits goes to the file
	# named after the limit, 0.5-wide bins end in "_5"/"_0", and masses
	# above 9000 go to "10000", "20000" or "big_pep".
	if ( $count_decoy % 100 == 0 ) {
		@sorted = sort { $hash_array{$a} <=> $hash_array{$b} } keys %hash_array;
		%hash_array = ();
		chdir($dir_name);
		foreach my $buf_idx (@sorted) {
			$row2       = $array_peptides[$buf_idx];
			@split_row2 = split( "\t", $row2 );
			$pep        = $split_row2[2];    # third column is the mass
			$hash_mass{$pep}++;

			if ( $pep > 9000 ) {
				if ( $pep <= 10000 ) {
					$file = "10000";
				}
				elsif ( $pep <= 20000 ) {
					$file = "20000";
				}
				else {
					$file = "big_pep";
				}
			}
			else {
				my $limit;
				if    ( $pep <= 500 )  { ( $limit, $bin ) = ( 500,  100 ); }
				elsif ( $pep <= 900 )  { ( $limit, $bin ) = ( 900,  1 ); }
				elsif ( $pep <= 3000 ) { ( $limit, $bin ) = ( 3000, 0.5 ); }
				elsif ( $pep <= 5000 ) { ( $limit, $bin ) = ( 5000, 1 ); }
				elsif ( $pep <= 8000 ) { ( $limit, $bin ) = ( 8000, 10 ); }
				else                   { ( $limit, $bin ) = ( 9000, 100 ); }

				$range = $bin + ( $bin * ( int( $pep / $bin ) ) );
				if ( $pep == $limit ) {
					$file = "$limit";
				}
				elsif ( $bin == 0.5 ) {

					# "900.5" becomes "900_5", "901" becomes "901_0"
					$file = $range;
					$file .= "_0" unless $file =~ s/\.5/_5/;
				}
				else {
					$file = $range;
				}
			}

			# A failed open used to go unnoticed and the row was lost.
			open( my $bin_fh, '>>', $file )
			  or die "Cannot append to peptide bin file '$file': $!\n";
			print {$bin_fh} $row2;
			close($bin_fh)
			  or die "Cannot finish writing peptide bin file '$file': $!\n";
			$n_pep_decoy++;
		}
		@array_peptides = ();
		$e = 0;
		chdir($dir_src);
	}

	# Ambiguous amino acid codes (J, B, Z, X): J is replaced with L.
	$protein =~ s/J/L/g;
	$count_decoy++;

	# Protein-level properties: number of amino acids, molecular weight
	# (flag 0 asks molecular_weight for the protein, not peptide, variant)
	# and isoelectric point rounded to two decimals.
	$n_aa = length($protein);
	( $MW, $PI ) = ( 0, 0 );
	$MW = molecular_weight( $protein, $ref_aa, $ref_mw_av, 0 );
	$PI = isoel_point( $protein, $ref_aminoq, $ref_pk, $ref_charge );
	$PI = sprintf( "%.2f", $PI );
			
	#enzymatic digestion
	# Splits $protein into peptides and stores one row per peptide in
	# %hash_prot, keyed by the running peptide number $m:
	#     "$id\t$mw_pep\t$peptide\t$start\t$stop"
	# with $start/$stop the 0-based, inclusive positions in the protein.
	#
	# Cleavage rule (unchanged): the chain is cut right after F, L or Y
	# unless followed by P, after W unless followed by M or P, after M
	# unless followed by P or Y, and after H unless followed by D, M, P or
	# W.  A residue must follow the site, so the last residue never cuts.
	# NOTE(review): this looks like a chymotrypsin-style rule - confirm.
	#
	# The previous implementation matched the whole remaining string with
	# /(^\w*)(site)(\w*$)/ and rebuilt the string as "$b . $c".  Because the
	# site group is two characters long, the rebuilt string always started
	# with the site that had just matched, so the loop never ended once a
	# site was found; the greedy prefix also selected the last site first.
	# The sites are now collected in one left-to-right scan.
	# NOTE(review): if the digestion of the target proteins elsewhere in
	# this file still uses the old loop, confirm both produce the same
	# peptides.
	$m      = 0;        # peptides emitted so far; also the peptide id
	$pos    = -1;       # index of the last cleaved residue, -1 if none
	$stop   = -1;
	$l_prot = $n_aa;
	$start  = 0;

	# Not used by the scan below; still initialised because they are
	# file-level globals.
	$first   = substr( $protein, 0, 2 );
	$string  = $protein;
	$flag_aa = 0;

	my @cut_sites;
	while (
		$protein =~ /[FLY](?=[^P])|W(?=[^MP])|M(?=[^PY])|H(?=[^DMPW])/g )
	{

		# pos() is the offset just past the matched residue
		push( @cut_sites, pos($protein) - 1 );
	}

	foreach my $cut (@cut_sites) {
		$start   = $stop + 1;
		$stop    = $cut;
		$pos     = $cut;
		$peptide = substr( $protein, $start, $stop - $start + 1 );
		$m++;

		#writes db
		$mw_pep = molecular_weight( $peptide, $ref_aa, $ref_mw_mn, 1 );
		$id     = $m;

		# The mass is only rounded here when no later step (missed
		# cleavages, PTM) will process the row again.
		if ( $n_mc == 0 && $flag_ptm eq "N" ) {
			$mw_pep = sprintf "%.4f", $mw_pep;
		}
		$hash_prot{$m} = "$id\t$mw_pep\t$peptide\t$start\t$stop";
	}

	#last peptide of the protein
	# Everything after the last cleaved residue (the whole protein when no
	# site was found).
	$shift = $pos + 1;
	$m++;
	$peptide = substr( $protein, $shift, $l_prot - $pos - 1 );
	$start   = $stop + 1;
	$stop    = $l_prot - 1;
	$mw_pep  = molecular_weight( $peptide, $ref_aa, $ref_mw_mn, 1 );
	$id      = $m;
	if ( $n_mc == 0 && $flag_ptm eq "N" ) {
		$mw_pep = sprintf "%.4f", $mw_pep;
	}
	$hash_prot{$m} = "$id\t$mw_pep\t$peptide\t$start\t$stop";

	#mc
	# Missed cleavages: every peptide is joined with up to $n_mc of the
	# peptides that follow it.  Joined rows are added to %hash_prot under
	# the id "<first>_<next>[_<next>...]"; the mass is the sum of the two
	# parts minus 18.01056 (one water), the start comes from the first part
	# and the stop from the last one.  Each longer chain is built on the
	# previous joined row.
	if ( $n_mc > 0 ) {
		@keys      = keys(%hash_prot);
		@keys_sort = sort { $a <=> $b } @keys;
		for ( $i = 0 ; $i < @keys_sort ; $i++ ) {

			# Rows of peptide $i and of its followers.  A plain bounds
			# check replaces exists() on an array element, which perl's
			# documentation strongly discourages.
			@array_mc = ();
			if ( ( @keys_sort - $i ) > 1 ) {
				for ( $z = 0 ; $z <= $n_mc && $i + $z < @keys_sort ; $z++ ) {
					$array_mc[$z] = $hash_prot{ $keys_sort[ $i + $z ] };
				}
			}

			$start_mc = $array_mc[0];
			for ( $z = 1 ; $z < @array_mc ; $z++ ) {
				( $pep_id1, $pep_mw1, $peptide1, $pep_start1 ) =
				  split( "\t", $start_mc );
				$pep_mc = $array_mc[$z];
				( $pep_id2, $pep_mw2, $peptide2, undef, $pep_stop2 ) =
				  split( "\t", $pep_mc );

				$new_id  = $pep_id1 . "_" . $pep_id2;
				$new_pep = $peptide1 . $peptide2;
				$new_mw  = $pep_mw1 + $pep_mw2 - 18.01056;

				# without PTMs no later step rounds the mass
				if ( $flag_ptm eq "N" ) {
					$new_mw = sprintf "%.4f", $new_mw;
				}
				$hash_prot{$new_id} =
				  "$new_id\t$new_mw\t$new_pep\t$pep_start1\t$pep_stop2";
				$start_mc = $hash_prot{$new_id};
			}
		}
		@array_mc = ();
	}

	#PTM
	# For every modification whose name does not contain "fix", each row of
	# %hash_prot is replaced by one row per number of modified residues:
	# "<id>_<name>0" (unmodified) up to "<id>_<name>N", where N is the
	# number of occurrences of the target residue(s) in the peptide, capped
	# at 4.  Each modified residue adds $ptm_weight_mn[$j] to the mass.
	# Masses are rounded to four decimals only while the last modification
	# ($j == $ptm_more) is processed.  The per-peptide count always decides
	# N, so ids ending in "*" are never produced here.
	if ( $flag_ptm eq "Y" ) {
		for ( $j = 0 ; $j < $ptm_more + 1 ; $j++ ) {
			@keys      = keys(%hash_prot);
			@keys_sort = sort { $a <=> $b } @keys;
			$aa_mod    = $ptm_aa[$j];
			$name      = $ptm_name[$j];
			next if $name =~ /fix/;

			my $is_last_ptm = ( $j == $ptm_more );

			# several target residues are matched as a character class
			my $site_re = length($aa_mod) > 1 ? "[$aa_mod]" : $aa_mod;

			for ( $i = 0 ; $i < @keys_sort ; $i++ ) {
				( $pep_id, $pep_mw, $peptide, $pep_start, $pep_stop ) =
				  split( "\t", $hash_prot{ $keys_sort[$i] } );

				# occurrences of the target residue(s) in this peptide
				$number_aa = () = $peptide =~ /$site_re/g;
				$num_ptm = $number_aa > 4 ? 4 : $number_aa;

				delete( $hash_prot{$pep_id} );
				$pep_mw = sprintf( "%.4f", $pep_mw ) if $is_last_ptm;

				# unmodified form
				$new_id = $pep_id . "_" . $name . "0";
				$hash_prot{$new_id} =
				  "$new_id\t$pep_mw\t$peptide\t$pep_start\t$pep_stop";

				# forms with 1 .. $num_ptm modified residues
				for ( $count_ptm = 1 ; $count_ptm <= $num_ptm ; $count_ptm++ )
				{
					$new_id = $pep_id . "_" . $name . $count_ptm;
					$new_mw = $pep_mw + ( $count_ptm * $ptm_weight_mn[$j] );
					$new_mw = sprintf( "%.4f", $new_mw ) if $is_last_ptm;
					$hash_prot{$new_id} =
					  "$new_id\t$new_mw\t$peptide\t$pep_start\t$pep_stop";
				}
			}
		}
	}
	# Write the header line of this decoy protein, move its peptide rows
	# from %hash_prot into the flush buffer (%hash_array / @array_peptides)
	# and append one line of per-class peptide counts to NFILE.
	$MW        = sprintf( "%.2f", $MW );
	@keys      = keys(%hash_prot);
	@keys_sort = sort { $a <=> $b } @keys;
	$tot_pep   = scalar(@keys_sort);

	#new file
	$header = "Decoy_" . $count_decoy . "\t-\t-\tDecoy protein\t$MW\t$PI\t$n_aa";
	print $PROT_FILE $header . "\n";

	foreach my $pep_key (@keys_sort) {
		$string_prot = $hash_prot{$pep_key};
		@split_prot  = split( "\t", $string_prot );

		# buffer the row, indexed by $e, together with its mass
		$hash_array{$e} = $split_prot[1];
		$array_peptides[$e] =
		  "Decoy_" . $count_decoy . "\t" . $string_prot . "\n";

		# Class key of the peptide: the index of the last id component
		# that starts with a digit (the number of joined peptides minus
		# one), followed by one digit per modification suffix: 1 when the
		# suffix count is above zero, 0 otherwise.  Ids with a single
		# component get "00".  A "*" suffix counts as 0 and excludes the
		# peptide from the table.
		@split_id  = split( "_", $pep_key );
		$n_split   = @split_id;
		$start_id  = 1;
		$flag_star = 0;
		$mc        = 0;
		if ( $n_split == 1 ) {
			$keys_hash_N_temp = "00";
		}
		else {
			for ( $d = 0 ; $d < $n_split ; $d++ ) {
				if ( $split_id[$d] =~ /^\d+/ ) {
					$mc = $d;
				}
			}
			$keys_hash_N_temp = $mc;
			if ( $flag_ptm ne "Y" ) {
				$keys_hash_N_temp .= "0";
			}
			else {
				$start_id = $mc + 1;
				for ( $d = $start_id ; $d < @split_id ; $d++ ) {
					$split_id[$d] =~ /([a-zA-Z]+)([0-9]+|\*{1})/;
					$mod   = $1;
					$n_mod = $2;
					if ( $n_mod eq "\*" ) {
						$keys_hash_N_temp .= "0";
						$flag_star = 1;
					}
					elsif ( $n_mod > 0 ) {
						$keys_hash_N_temp .= "1";
					}
					else {
						$keys_hash_N_temp .= "0";
					}
				}
			}
		}

		# only masses inside [$low_b, $upp_b] are counted in the table
		if (   $flag_star == 0
			&& $split_prot[1] >= $low_b
			&& $split_prot[1] <= $upp_b )
		{
			$hash_tableN{$keys_hash_N_temp}++;
		}
		$hash_count_pep{ $split_prot[1] }++;
		$e++;
		delete $hash_prot{$pep_key};
	}

	# one tab-terminated count per known class key, 0 when none was seen;
	# the table is emptied for the next protein
	foreach my $table_key (@keys_all_N) {
		my $class_count = delete( $hash_tableN{$table_key} );
		$class_count = 0 if $class_count eq "";
		print NFILE $class_count . "\t";
	}
	print NFILE "\n";

	#}
}

#last proteins
# Flush the decoy peptide rows still buffered after the last decoy protein
# to the mass-bin files in $dir_name, then return to $dir_src and close the
# protein and count files.
#
# %hash_array maps a buffer index to the peptide mass and @array_peptides
# holds the row for that index; rows are written in ascending mass order.
# The file is named after the upper edge of the mass bin
# ($bin + $bin * int($pep / $bin)); bin widths are 100 up to 500, 1 up to
# 900, 0.5 up to 3000, 1 up to 5000, 10 up to 8000 and 100 up to 9000.  A
# mass equal to one of those limits goes to the file named after the limit,
# 0.5-wide bins end in "_5"/"_0", and masses above 9000 go to "10000",
# "20000" or "big_pep".
@sorted = sort { $hash_array{$a} <=> $hash_array{$b} } keys %hash_array;
%hash_array = ();
chdir($dir_name);
foreach my $buf_idx (@sorted) {
	$row2       = $array_peptides[$buf_idx];
	@split_row2 = split( "\t", $row2 );
	$pep        = $split_row2[2];    # third column of the row is the mass
	$hash_mass{$pep}++;

	if ( $pep > 9000 ) {
		if ( $pep <= 10000 ) {
			$file = "10000";
		}
		elsif ( $pep <= 20000 ) {
			$file = "20000";
		}
		else {
			$file = "big_pep";
		}
	}
	else {
		my $limit;
		if    ( $pep <= 500 )  { ( $limit, $bin ) = ( 500,  100 ); }
		elsif ( $pep <= 900 )  { ( $limit, $bin ) = ( 900,  1 ); }
		elsif ( $pep <= 3000 ) { ( $limit, $bin ) = ( 3000, 0.5 ); }
		elsif ( $pep <= 5000 ) { ( $limit, $bin ) = ( 5000, 1 ); }
		elsif ( $pep <= 8000 ) { ( $limit, $bin ) = ( 8000, 10 ); }
		else                   { ( $limit, $bin ) = ( 9000, 100 ); }

		$range = $bin + ( $bin * ( int( $pep / $bin ) ) );
		if ( $pep == $limit ) {
			$file = "$limit";
		}
		elsif ( $bin == 0.5 ) {

			# "900.5" becomes "900_5", a whole number such as "901"
			# becomes "901_0"
			$file = $range;
			$file .= "_0" unless $file =~ s/\.5/_5/;
		}
		else {
			$file = $range;
		}
	}

	# A failed open used to go unnoticed and the row was silently lost.
	open( my $bin_fh, '>>', $file )
	  or die "Cannot append to peptide bin file '$file': $!\n";
	print {$bin_fh} $row2;
	close($bin_fh)
	  or die "Cannot finish writing peptide bin file '$file': $!\n";
	$n_pep_decoy++;
}
@array_peptides = ();
$e = 0;
chdir($dir_src);


$PROT_FILE->close();
close(NFILE);
print "END\n\n";

#sort peptides
# Rewrites every bin file of $dir_name (the directory entries without a
# dot in their name) with its rows in ascending order of the mass stored
# in the third tab-separated column.  Rows with equal mass keep their
# original relative order, so the output no longer depends on hash
# ordering.
#
# A file that cannot be read is now skipped with a warning instead of
# being reopened for writing, which would have truncated it; entries that
# are not plain files are skipped as well.
chdir($dir_name);
opendir( DB, $dir_name );
@allfiles = readdir(DB);
closedir(DB);
foreach my $bin_name (@allfiles) {
	next if index( $bin_name, "." ) >= 0;
	next unless -f $bin_name;

	my $in;
	unless ( open( $in, '<', $bin_name ) ) {
		warn "Cannot read '$bin_name', file left unsorted: $!\n";
		next;
	}
	@array_peptides = <$in>;
	close($in);

	# order the row indexes by mass, ties by original position
	my @mass = map { ( split( "\t", $_ ) )[2] } @array_peptides;
	@sorted = sort { $mass[$a] <=> $mass[$b] || $a <=> $b }
	  0 .. $#array_peptides;

	my $out;
	unless ( open( $out, '>', $bin_name ) ) {
		warn "Cannot rewrite '$bin_name', file left unsorted: $!\n";
		next;
	}
	print {$out} @array_peptides[@sorted];
	close($out) or warn "Error while writing '$bin_name': $!\n";
}
@array_peptides = ();
@sorted         = ();

#update the file in tmp folder
# Appends "<database name>\t<number of decoy peptides>" to the decoy_pep
# file of $dir_tmp.  The open is now checked, so a failure is reported
# instead of the record being silently dropped.
chdir($dir_tmp);
#number of random peptide in the composite database
open( my $n_dec, '>>', "decoy_pep" )
  or die "Cannot append to decoy_pep: $!\n";
print {$n_dec} "$nome_db\t$n_pep_decoy\n";
close($n_dec) or die "Cannot finish writing decoy_pep: $!\n";
