#!/usr/bin/perl

#/*******************************************************************************
# * Contributors:
# *    Susanna Zucca, Margherita Villaraggia - susanna.zucca(at)unipv.it
# *******************************************************************************/


#==============================================================================
#title           :trimming.pl
#description     :This script taks as input paired-end FastQ files and R1 and R2 primer lists (generated by print_primer_seq.pl) and provides as output R1.trim.fastq and R2.trim.fastq fastq files with trimmed reads. Primer_id.txt file is generated, required to remove reads with reads_remove.pl script
#authors         :Susanna Zucca, Margherita Villaraggia
#dependencies    :List::Util
#date            :20160216
#version         :1.0   
#usage		 :perl trimming.pl Sample_R1.fastq Sample_R2.fastq R1_primer.txt R2_primer.txt
#notes		 :R1_primer.txt and R2_primer.txt are generated by print_primer_seq.pl
#example	 :perl trimming.pl Example_R1.fastq Example_R2.fastq Example_manifest.txt_Primer_R1.txt Example_manifest.txt_Primer_R2.txt
#==============================================================================
use strict;
use warnings;
use List::Util qw(first max maxstr min minstr reduce shuffle sum);


my $num_args = $#ARGV + 1;
if ($num_args != 4) {
    print "\nUsage: trimming.pl \nFastQ_R1_file \nFastQ_R2_file \nR1_Primer_file.txt \nR2_Primer_file.txt\n";
    exit;
}


#==============================================================================
#IMPORT DATA AND INITIALIZE OUTPUT FILES
#==============================================================================

my $fQ1=$ARGV[0];
my $fQ2=$ARGV[1];
my $primer_file_R1=$ARGV[2];
my $primer_file_R2=$ARGV[3];

my $trim1_name=$fQ1.".trim.fastq";
my $trim2_name=$fQ2.".trim.fastq";

my $primer_id_name=$fQ1."_primer_ID.txt";


open (TRIM1, ">$trim1_name");
open (TRIM2, ">$trim2_name");
open (FASTQ1, "<$fQ1") or die "Can't open $fQ1 file!";
open (FASTQ2, "<$fQ2") or die "Can't open $fQ2 file!";
print "opening file $primer_file_R1...\n"; 
open (PRIMER, "<$primer_file_R1") or die "Can't open $primer_file_R1 file!";
print "opening file $primer_file_R2...\n";
open (PRIMER_IN, "<$primer_file_R2") or die "Can't open $primer_file_R2 file!";
open (ID, ">$primer_id_name");

my @fastq1=<FASTQ1>;
my @fastq2=<FASTQ2>;
my @primer=<PRIMER>;
my @primer_in=<PRIMER_IN>;


my $i=0;
my $li=@fastq1;
my $count=0;
my @listID1=();
my @IDsplit1=();
my @listR1=();
my @quality1=();
my @listID2=();
my @IDsplit2=();
my @listR2=();
my @quality2=();
my @segno1=();
my @segno2=();

#==============================================================================
#EXTRACT READS ID INFORMATION FROM FASTQ
#==============================================================================

for ($i=0;$i<$li/4;$i++){	

$count=4*$i;
$listID1[$i]=$fastq1[$count];
($IDsplit1[$i])=split(' ', $listID1[$i]);
$listID2[$i]=$fastq2[$count];
($IDsplit2[$i])=split(' ', $listID2[$i]);
$count++;
$listR1[$i]=$fastq1[$count];
$listR2[$i]=$fastq2[$count];
$count++;
$segno1[$i]=$fastq1[$count];
$segno2[$i]=$fastq2[$count];
$count++;
$quality1[$i]=$fastq1[$count];
$quality2[$i]=$fastq2[$count];

} 



shift @primer;	
my $p=0;
my $lp=@primer;
my @chr=();
my @start=();
my @end=();
my @ULSO=();
my @DLSO=();
my $conto=0;

for ($p=0;$p<$lp;$p++){	

($chr[$conto],$start[$conto],$end[$conto],$ULSO[$conto],$DLSO[$conto])=split('\t',$primer[$p]);
$conto++;

} 


shift @primer_in;	
my $pi=0;
my $lpi=@primer_in;
my @chr_in=();
my @start_in=();
my @end_in=();
my @ULSO_in=();
my @DLSO_in=();
my $conto_in=0;
my @chr_tot=();
my @start_tot=();
my @end_tot=();
my @sequence=();
my @strand=();
my @id=();

for ($pi=0;$pi<$lpi;$pi++){

($chr_in[$conto_in],$start_in[$conto_in],$end_in[$conto_in],$ULSO_in[$conto_in],$DLSO_in[$conto_in],$chr_tot[$conto_in],$start_tot[$conto_in],$end_tot[$conto_in],$sequence[$conto_in],$strand[$conto_in],$id[$conto_in])=split('\t',$primer_in[$pi]);
$conto_in++;

} 

#==============================================================================
#EXTRACT PRIMER INFORMATION FOR EACH READ (admitted one single nucleotide mismatch) and PRINT RESULTS
#==============================================================================

my $r=0;
my $lr=@listR1;
my $string_new;
my $len_p1;
my $len_p2;
my $quality_new1;
my $quality_new2;
my $id_primer;

for ($r=0;$r<$lr;$r++){


my $c=0;
my $lc=@DLSO;
my $found_primer=0;
my $found_primer2=0;
my $id_primer;

for ($c=0;$c<$lc;$c++){
$len_p1=length($DLSO[$c]);
$found_primer=0;
if ($listR1[$r] =~ /^$DLSO[$c]/){

$found_primer=1;
last;
} 
else{
my $mask = pack("A*",$DLSO[$c])^pack("A*",$listR1[$r]);
my $matches = $mask =~ tr/\x0/\x0/; 
if ($matches==($len_p1-1)){
$found_primer=1;
last;
}	
} 
} 

my $d=0;
if ($found_primer==1){


$found_primer2=0;
$len_p2=length($ULSO_in[$c]);
my $mask2 = pack("A*",$ULSO_in[$c])^pack("A*",$listR2[$r]);
my $matches2 = $mask2 =~ tr/\x0/\x0/; 

if ($listR2[$r] =~ /^$ULSO_in[$c]/){
$found_primer2=1;
$id_primer=$id[$c];

print TRIM1 "$listID1[$r]";
$string_new=substr($listR1[$r],$len_p1);
print TRIM1 "$string_new";
print TRIM1 "$segno1[$r]";
$quality_new1=substr($quality1[$r],$len_p1);
print TRIM1 "$quality_new1";
print ID "$id_primer"; 
print TRIM2 "$listID2[$r]";
$string_new=substr($listR2[$r],$len_p2);
print TRIM2 "$string_new";
print TRIM2 "$segno2[$r]";
$quality_new2=substr($quality2[$r],$len_p2);
print TRIM2 "$quality_new2";
next;	

}

elsif($matches2==($len_p2-1)){

$found_primer2=1;
$id_primer=$id[$c];

print TRIM1 "$listID1[$r]";
$string_new=substr($listR1[$r],$len_p1);
print TRIM1 "$string_new";
print TRIM1 "$segno1[$r]";
$quality_new1=substr($quality1[$r],$len_p1);
print TRIM1 "$quality_new1";
print ID "$id_primer"; 
print TRIM2 "$listID2[$r]";
$string_new=substr($listR2[$r],$len_p2);
print TRIM2 "$string_new";
print TRIM2 "$segno2[$r]";
$quality_new2=substr($quality2[$r],$len_p2);
print TRIM2 "$quality_new2";
next;	
}	
else{	
my $d=0;
my $ld=@ULSO_in;

for ($d=0;$d<$ld;$d++){
$found_primer2=0;	
if ($listR2[$r] =~ /^$ULSO_in[$d]/){
$found_primer2=1;
last;	
}
else { 
my $mask2 = pack("A*",$ULSO_in[$d])^pack("A*",$listR2[$r]);
my $matches2 = $mask2 =~ tr/\x0/\x0/; 

if ($matches2==($len_p2-1)){

$found_primer2=1;
last;	
}	
} 
} 

if ($found_primer2==1){
$found_primer=0;
$len_p1=length($DLSO[$d]);
my $mask2 = pack("A*",$DLSO[$d])^pack("A*",$listR1[$r]);
my $matches2 = $mask2 =~ tr/\x0/\x0/; 

if ($listR1[$r] =~ /^$DLSO[$d]/){
$found_primer=1;
$id_primer=$id[$d];
print "Exact match of primers in reads R1 and R2\n";
print TRIM1 "$listID1[$r]";
$string_new=substr($listR1[$r],$len_p1);
print TRIM1 "$string_new";
print TRIM1 "$segno1[$r]";
$quality_new1=substr($quality1[$r],$len_p1);
print TRIM1 "$quality_new1";
print ID "$id_primer"; 
print TRIM2 "$listID2[$r]";
$string_new=substr($listR2[$r],$len_p2);
print TRIM2 "$string_new";
print TRIM2 "$segno2[$r]";
$quality_new2=substr($quality2[$r],$len_p2);
print TRIM2 "$quality_new2";
next;	
}
if($matches2==($len_p1-1)){
$found_primer=1;
$id_primer=$id[$d];
print "Exact match for R2, one mismatch for R1\n";
print TRIM1 "$listID1[$r]";
$string_new=substr($listR1[$r],$len_p1);
print TRIM1 "$string_new";
print TRIM1 "$segno1[$r]";
$quality_new1=substr($quality1[$r],$len_p1);
print TRIM1 "$quality_new1";
print ID "$id_primer"; 
print TRIM2 "$listID2[$r]";
$string_new=substr($listR2[$r],$len_p2);
print TRIM2 "$string_new";
print TRIM2 "$segno2[$r]";
$quality_new2=substr($quality2[$r],$len_p2);
print TRIM2 "$quality_new2";
next;	
}	
}	
}	
} 
} 




