#!/usr/bin/perl #/******************************************************************************* # * Contributors: # * Susanna Zucca, Margherita Villaraggia - susanna.zucca(at)unipv.it # *******************************************************************************/ #============================================================================== #title :reads_remove.pl #description :This script taks as input trimmed paired-end FastQ files, vcf file (generated by any variant caller) and provides as output R1.reads_remove.fastq and R2.reads_remove.fastq fastq files with removed all reads generated by a primer pairing with a region containing a mutation reported in vcf file. #authors :Susanna Zucca, Margherita Villaraggia #dependencies :List::Util #date :20160216 #version :1.0 #usage :perl trimming.pl Sample_R1.fastq Sample_R2.fastq R1_primer.txt R2_primer.txt #notes :R1_primer.txt and R2_primer.txt are generated by print_primer_seq.pl #example :perl reads_remove.pl Example_R1.fastq.trim.fastq Example_R2.fastq.trim.fastq Example.vcf Example_R1.fastq_primer_ID.txt Example_manifest.txt_Primer_R2.txt #============================================================================== use strict; use warnings; use Spreadsheet::BasicRead; use List::Util qw(first); my $num_args = $#ARGV + 1; if ($num_args != 5) { print "\nUsage: reads_remove.pl \nTrimmed_FastQ_R1_file \nTrimmed_FastQ_R2_file\n vcf file\n Primer_ID.txt file \nR2_Primer_file.txt\n"; exit; } #============================================================================== #IMPORT DATA AND INITIALIZE OUTPUT FILES #============================================================================== my $file_read1=$ARGV[0]; my $file_read2=$ARGV[1]; my $vcf_file = $ARGV[2]; my $file_name_ID_primer=$ARGV[3]; my $file_primer=$ARGV[4]; my $output1=$ARGV[0].".read_remove.fastq"; my $output2=$ARGV[1]."_read_remove.fastq"; my $tmp_tmp="tmp.txt"; open (TEMP, ">$tmp_tmp"); open (PRIMER, "$file_primer"); open (READ1, "$file_read1"); open (READ2, "$file_read2"); open (VCF, "$vcf_file"); open (ID_PRIMER, "$file_name_ID_primer"); open (FASTQ1NEW, ">$output1"); open (FASTQ2NEW, ">$output2"); my @primer=; my @read1=; my @read2=; my @id_primer=; my @Vcf = ; my @vcf = grep{!/^#/} @Vcf; shift(@primer); chomp (@primer); my $p=0; my $lp=@primer; my @chr_p=(); my @start_p1=(); my @end_p1=(); my @start_p2=(); my @end_p2=(); my @tarID=(); my $ch; my $st; my $en; my $ULSO; my $DLSO; my $chr_tot; my $start_tot; my $end_tot; my $seq; my $strand; my $targetID; #============================================================================== #LOOK FOR PRIMERS MATCHING A MUTATED REGION #============================================================================== for ($p=0;$p<$lp;$p++){ ($ch,$st,$en,$ULSO,$DLSO,$chr_tot,$start_tot,$end_tot,$seq,$strand,$targetID)=split('\t',$primer[$p]); $chr_p[$p]=$chr_tot; $start_p1[$p]=$start_tot; $end_p2[$p]=$end_tot; $tarID[$p]=$targetID; my $lung_ULSO=length($ULSO); my $lung_DLSO=length($DLSO); if ($strand eq '+'){ $end_p1[$p]=$start_tot+$lung_ULSO-1; $start_p2[$p]=$end_tot-$lung_DLSO+1; } else { $end_p1[$p]=$start_tot+$lung_DLSO-1; $start_p2[$p]=$end_tot-$lung_ULSO+1; } } my $v=0; my $lv=@vcf; my $Chr; my $chr; my $start; my $end; my $id_read; my $ref; my $alt; my $c=0; my $lc=@primer; my $conta_elim=0; my @index_for_elimination=(); my $index=0; for ($v=0;$v<$lv;$v++){ ($Chr, $start, $id_read, $ref, $alt)=split('\t',$vcf[$v]); $end=$start+length($alt)-length($ref); $chr="chr".$Chr; for ($c=0;$c<$lc;$c++){ if ((($chr eq $chr_p[$c]) && ($start>=$start_p1[$c]) && ($end<=$end_p1[$c])) || (($chr eq $chr_p[$c]) && ($start>=$start_p2[$c]) && ($end<=$end_p2[$c]))){ print TEMP "Primer Matching a mutated region\n"; print TEMP "Mutation: $chr $start $end\n"; print TEMP "Primer: $chr_p[$c] $start_p1[$c] $end_p1[$c] $start_p2[$c] $end_p2[$c] $tarID[$c]\n"; my $pr=0; my $lpr=@id_primer; for ($pr=0;$pr<$lpr;$pr++){ chomp ($tarID[$c]); chomp ($id_primer[$pr]); #============================================================================== #LOOK FOR READS GENERATED BY PRIMERS MATCHING MUTATED REGIONS #============================================================================== if ($tarID[$c] eq $id_primer[$pr]){ print TEMP "Primer $tarID[$c] generated read $id_primer[$pr]\n"; push (@index_for_elimination,$pr); } } } } } my @index_for_elimination_sort=sort { $a <=> $b } @index_for_elimination; print TEMP "Index of the reads to be removed: @index_for_elimination_sort\n"; #============================================================================== #PRINT SURVIVED READS IN NEW FASTQ FILES #============================================================================== my $index_count=0; my $re=0; my $lre=@read1; my $pr=0; my $lpr=@index_for_elimination_sort; my $flag=0; my $count1=0; for ($re=0;$re<$lre/4;$re++){ $count1=$re*4; $flag=0; for ($pr=0;$pr<$lpr;$pr++){ if ($index_count==$index_for_elimination_sort[$pr]){ $flag=1; } } if ($flag==0){ print FASTQ1NEW "$read1[$count1]"; print FASTQ1NEW "$read1[$count1+1]"; print FASTQ1NEW "$read1[$count1+2]"; print FASTQ1NEW "$read1[$count1+3]"; print FASTQ2NEW "$read2[$count1]"; print FASTQ2NEW "$read2[$count1+1]"; print FASTQ2NEW "$read2[$count1+2]"; print FASTQ2NEW "$read2[$count1+3]"; } $index_count++; }