Axtner_2019_GigaScience/protax_classify.sh at master · EcoDynIZW/Axtner_2019_GigaScience · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/bin/bash

set -e
set -u
set -o pipefail

# Send STDOUT and STDERR to log file
exec > >(tee -a protax_classify.`date +%Y-%m-%d`.log)
exec 2> >(tee -a protax_classify.`date +%Y-%m-%d`.log >&2)

##### INFO

# protax_classify.sh

# for classifying batches of pre-preprocessed reads using a pre-trained unweighted protax model and cleaned reference database

# written by Alex Crampton-Platt for Andreas Wilting (IZW, ScreenForBio project)

# usage: bash protax_classify.sh pathToData locus protaxdir screenforbio outdir
# where:
# pathToData is the path to a directory containing FASTQ files to be processed one by one
# locus is the target locus, must be one of: 12S, 16S, CYTB, COI. if you have more than one locus per run, run script once for each.
# protaxdir is the path to a directory containing protax models and clean databases for all 4 loci
# screenforbio is the path to the screenforbio-mbc directory (must contain subdirectory protaxscripts)
# outdir is the name to give an output directory (inside current)

##### WELCOME
if [ "$#" -ne 5 ]
then
	echo ""
	echo "You are trying to use protax_classify.sh but have not provided enough information."
	echo "Please check the following:"
	echo ""
	echo "usage: bash protax_classify.sh pathToData locus protaxdir outdir"
	echo "where:"
	echo "pathToData is the path to a directory containing FASTQ files to be processed one by one"
  echo "locus is the target locus, must be one of: 12S, 16S, CYTB, COI. if you have more than one locus per run, run script once for each."
  echo "protaxdir is the path to a directory containing protax models and clean databases for all 4 loci (one subdirectory per locus) plus required scripts"
	echo "screenforbio is the path to the screenforbio-mbc directory (must contain subdirectory protaxscripts)"
	echo "outdir is the name to give an output directory (inside current)"
	echo ""
	exit 1
else
	echo "Welcome to protax_classify.sh: batch classification of FASTQ files with an unweighted PROTAX model"
	echo ""
	echo "The time now is $(date)."
	echo "Information from you:"
	echo "FASTQ files to be processed are in ${1}"
  echo "Protax models and reference databases are in ${3}"
	echo "PROTAX Perl scripts are in ${4}/protaxscripts"
	echo "Output directory will be ${5}"
	echo ""
fi

##### PARAMETERS
DATA=${1}
LOCUS=${2}
MODELS=${3}
SCRIPTS=${4}
OUTDIR=${5}
DATE=`date +%Y-%m-%d`

##### MAIN
echo "Step 1: Get names of input files"
echo ""
echo "File names are:"
file_name=($(find ${DATA} -mindepth 1 -maxdepth 1 -type f -name "*.${LOCUS}.filter.derep.fq"))
printf '%s\n' "${file_name[@]}"
echo ""
echo "There are ${#file_name[@]} files to process."
echo ""

echo "Step 2: Make output directory"
echo ""
mkdir ./${OUTDIR}_${LOCUS}
echo "Output will be in: ./${OUTDIR}_${LOCUS}"
echo ""

echo "Step 3: Run protax on each input file"
echo ""
echo "This may take a while..."
echo ""
start=`date +%s`
for file in ${file_name[@]}
do
	if [ -s ${file} ]
	then
		echo "Processing file: ${file}"
		starting=`date +%s`
		echo "	Starting at $(date)."
		#get sample name
		sample_name="$(basename "$file" .filter.derep.fq)"
	  #make output directory for sample
	  mkdir ./${OUTDIR}_${LOCUS}/${sample_name}/
	  #convert fastq to fasta
		echo "	Converting FASTQ to FASTA..."
	  usearch -fastq_filter ${file} -fastaout ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.fa
	  #run LAST search
		echo "	Running LAST search..."
	  lastal -T 1 -a 1 -f 0 -m 1000 ${MODELS}/model_${LOCUS}/lastref_${LOCUS} ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.fa > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.last
	  #convert LAST result to similarity
		echo "	Convert LAST result to sequence similarity..."
	  perl ${SCRIPTS}/protaxscripts/last2sim.pl ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.last > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.lastsim
	  #clean up
	  rm ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.last
	  #get IDs
		echo "	Get sequence IDs..."
	  grep '^>' ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.fa | cut -c2- > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.ids
	  #generate base log probability file
		echo "	Generate base log probability file..."
	  perl ${SCRIPTS}/protaxscripts/testsample2init.pl ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.ids > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.0.logprob
	  #classify sequences at each taxonomic level (order, family, genus, species)
		echo "	Classifying at..."
	  for LEVEL in 1 2 3 4
	  do
	    echo "		LEVEL $LEVEL"
	    PREVLEVEL=$((LEVEL-1))
	    IFILE=./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.${PREVLEVEL}.logprob
	    OFILE=./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.${LEVEL}.logprob
	    perl ${SCRIPTS}/protaxscripts/classify4.pl $IFILE ${MODELS}/model_${LOCUS}/tax$LEVEL ${MODELS}/model_${LOCUS}/ref.tax$LEVEL ${MODELS}/model_${LOCUS}/rseqs$LEVEL ${MODELS}/model_${LOCUS}/mcmc$LEVEL map ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.lastsim 0 .05 $OFILE 1
	  done
	  #add taxonomic info
		echo "	Adding taxonomy..."
	  perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.0.logprob > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.class_probs
	  perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.1.logprob > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.order_probs
	  perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.2.logprob > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.family_probs
	  perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.3.logprob > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.genus_probs
	  perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.4.logprob > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.species_probs
		#add sequence similarity of assigned species/genus (for unassigned reads takes best matching sequence)
		for id in $(cut -f1 ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.species_probs | sort -u)
		do
			sp=$(grep ${id} ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.species_probs | cut -f5 | awk 'BEGIN{FS=","}{print $3 "_" $4}')
			if grep -q "_unk$" <(echo ${sp})
			then
				gen=$(sed 's/_unk//' <(echo ${sp}))
				#find gen
				grep ${id} ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.lastsim | grep ${gen} | awk -v max=0 '{if($3>max){want=$0;max=$3}}END{print want}' >> ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.bestsim
			else
				#find sp (for unassigned reads will pick up best matching sequence)
				grep ${id} ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.lastsim | grep ${sp} | awk -v max=0 '{if($3>max){want=$0;max=$3}}END{print want}' >> ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.bestsim
			fi
		done
		join -j 1 -a 1 -o 1.1,1.2,1.3,1.4,1.5,2.3,2.2 <(sort -k1,1 ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.species_probs) <(sort -k1,1 ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.bestsim) > ./${OUTDIR}_${LOCUS}/${sample_name}/${sample_name}.species_probs_sim
		#finish iteration with message
		echo ""
		echo "	Finished at $(date)."
		ending=`date +%s`
		runningtime=$((ending-starting))
		echo "	Processing of ${file} took `echo $runningtime | awk '{printf "%.2f", $1/60}'` minutes"
		echo ""
	else
		echo "${file} empty, skipping..."
		echo ""
	fi
done
echo ""

##### END
echo "protax_classify complete for all input files"
end=`date +%s`
runtime=$((end-start))
echo "This took a total of `echo $runtime | awk '{printf "%.2f", $1/60}'` minutes (`echo $runtime | awk '{printf "%.2f", $1/3600}'` hours)."
echo ""
echo "Result files are in ./${OUTDIR}_${LOCUS}/sample_name"
echo "Classification for each read at each taxonomic level (species, genus, family, order) in files sample_name.level_probs"
echo "Headers are:"
echo "	queryID	taxID	log(probability)	level	taxon"
echo "Additionally, the best matching hit (for assigned species/genus where available) found with LAST is appended to sample_name.species_probs in sample_name.species_probs_sim"
echo "Headers are:"
echo "	queryID	taxID	log(probability)	level	taxon	bestHit_similarity	bestHit"
echo ""
echo "Have a nice day :-)"
echo ""