#!/usr/bin/perl

#/*******************************************************************************
# * Contributors:
# *    Susanna Zucca, Margherita Villaraggia - susanna.zucca(at)unipv.it
# *******************************************************************************/


#==============================================================================
#title           :reads_remove.pl
#description     :This script taks as input trimmed paired-end FastQ files, vcf file (generated by any variant caller) and provides as output R1.reads_remove.fastq and R2.reads_remove.fastq fastq files with removed all reads generated by a primer pairing with a region containing a mutation reported in vcf file.
#authors         :Susanna Zucca, Margherita Villaraggia
#dependencies    :List::Util
#date            :20160216
#version         :1.0   
#usage		 :perl trimming.pl Sample_R1.fastq Sample_R2.fastq R1_primer.txt R2_primer.txt
#notes		 :R1_primer.txt and R2_primer.txt are generated by print_primer_seq.pl
#example	 :perl reads_remove.pl Example_R1.fastq.trim.fastq Example_R2.fastq.trim.fastq Example.vcf Example_R1.fastq_primer_ID.txt Example_manifest.txt_Primer_R2.txt
#==============================================================================
use strict;
use warnings;
use Spreadsheet::BasicRead;
use List::Util qw(first);




my $num_args = $#ARGV + 1;
if ($num_args != 5) {
    print "\nUsage: reads_remove.pl \nTrimmed_FastQ_R1_file \nTrimmed_FastQ_R2_file\n vcf file\n Primer_ID.txt file \nR2_Primer_file.txt\n";
    exit;
}

#==============================================================================
#IMPORT DATA AND INITIALIZE OUTPUT FILES
#==============================================================================

my $file_read1=$ARGV[0];
my $file_read2=$ARGV[1];
my $vcf_file = $ARGV[2];
my $file_name_ID_primer=$ARGV[3];
my $file_primer=$ARGV[4];

my $output1=$ARGV[0].".read_remove.fastq";
my $output2=$ARGV[1]."_read_remove.fastq";
my $tmp_tmp="tmp.txt";


open (TEMP, ">$tmp_tmp");  
open (PRIMER, "$file_primer");	
open (READ1, "$file_read1");
open (READ2, "$file_read2");
open (VCF, "$vcf_file");
open (ID_PRIMER, "$file_name_ID_primer");
open (FASTQ1NEW, ">$output1"); 
open (FASTQ2NEW, ">$output2"); 

my @primer=<PRIMER>;
my @read1=<READ1>;
my @read2=<READ2>;
my @id_primer=<ID_PRIMER>;
my @Vcf = <VCF>;

my @vcf = grep{!/^#/} @Vcf;

shift(@primer);
chomp (@primer);


my $p=0;
my $lp=@primer;
my @chr_p=();   
my @start_p1=(); 
my @end_p1=();   
my @start_p2=();  
my @end_p2=();     
my @tarID=();

my $ch;
my $st;
my $en;
my $ULSO;
my $DLSO;
my $chr_tot;
my $start_tot;
my $end_tot;
my $seq;
my $strand;
my $targetID;

#==============================================================================
#LOOK FOR PRIMERS MATCHING A MUTATED REGION
#==============================================================================

for ($p=0;$p<$lp;$p++){ 
($ch,$st,$en,$ULSO,$DLSO,$chr_tot,$start_tot,$end_tot,$seq,$strand,$targetID)=split('\t',$primer[$p]);
$chr_p[$p]=$chr_tot;
$start_p1[$p]=$start_tot;
$end_p2[$p]=$end_tot;
$tarID[$p]=$targetID;
my $lung_ULSO=length($ULSO);
my $lung_DLSO=length($DLSO);

if ($strand eq '+'){
$end_p1[$p]=$start_tot+$lung_ULSO-1;
$start_p2[$p]=$end_tot-$lung_DLSO+1;
}  
else {
$end_p1[$p]=$start_tot+$lung_DLSO-1;
$start_p2[$p]=$end_tot-$lung_ULSO+1;
}   
}  


my $v=0;
my $lv=@vcf;
my $Chr;
my $chr;
my $start;
my $end;
my $id_read;
my $ref;
my $alt;
my $c=0;
my $lc=@primer;
my $conta_elim=0;
my @index_for_elimination=();
my $index=0;

for ($v=0;$v<$lv;$v++){	

($Chr, $start, $id_read, $ref, $alt)=split('\t',$vcf[$v]);
$end=$start+length($alt)-length($ref);
$chr="chr".$Chr;

for ($c=0;$c<$lc;$c++){ 

if ((($chr eq $chr_p[$c]) && ($start>=$start_p1[$c]) && ($end<=$end_p1[$c])) || (($chr eq $chr_p[$c]) && ($start>=$start_p2[$c]) && ($end<=$end_p2[$c]))){

print TEMP "Primer Matching a mutated region\n";
print TEMP "Mutation: $chr $start $end\n";
print TEMP "Primer: $chr_p[$c] $start_p1[$c] $end_p1[$c] $start_p2[$c] $end_p2[$c] $tarID[$c]\n";

my $pr=0;
my $lpr=@id_primer;

for ($pr=0;$pr<$lpr;$pr++){ 
chomp ($tarID[$c]);
chomp ($id_primer[$pr]);
#==============================================================================
#LOOK FOR READS GENERATED BY PRIMERS MATCHING MUTATED REGIONS
#==============================================================================
if ($tarID[$c] eq $id_primer[$pr]){

print TEMP "Primer $tarID[$c] generated read $id_primer[$pr]\n";
push (@index_for_elimination,$pr);   
}  
}  
}  
}  
}

my @index_for_elimination_sort=sort { $a <=> $b } @index_for_elimination;
print TEMP "Index of the reads to be removed: @index_for_elimination_sort\n";
#==============================================================================
#PRINT SURVIVED READS IN NEW FASTQ FILES
#==============================================================================

my $index_count=0;
my $re=0;
my $lre=@read1;
my $pr=0;
my $lpr=@index_for_elimination_sort;
my $flag=0;
my $count1=0;


for ($re=0;$re<$lre/4;$re++){
$count1=$re*4;
$flag=0;

for ($pr=0;$pr<$lpr;$pr++){ 
if ($index_count==$index_for_elimination_sort[$pr]){
$flag=1;
}   
}  

if ($flag==0){

    print FASTQ1NEW "$read1[$count1]";
    print FASTQ1NEW "$read1[$count1+1]";
    print FASTQ1NEW "$read1[$count1+2]";
    print FASTQ1NEW "$read1[$count1+3]";
    print FASTQ2NEW "$read2[$count1]";
    print FASTQ2NEW "$read2[$count1+1]";
    print FASTQ2NEW "$read2[$count1+2]";
    print FASTQ2NEW "$read2[$count1+3]";
    
}   

$index_count++;

}  







