screenforbio-mbc/protax_classify_otus.sh at master · alexcrampton-platt/screenforbio-mbc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/bin/bash

set -e
set -u
set -o pipefail

# Send STDOUT and STDERR to log file
exec > >(tee -a protax_classify_otus.`date +%Y-%m-%d`.log)
exec 2> >(tee -a protax_classify_otus.`date +%Y-%m-%d`.log >&2)

##### INFO

# protax_classify_otus.sh

# for classifying OTUs using a pre-trained unweighted protax model and cleaned reference database

# written by Alex Crampton-Platt for Andreas Wilting (IZW, ScreenForBio project)

# usage: bash protax_classify_otus.sh otus locus protaxdir screenforbio outdir
# where:
# otus is the (path to) the OTU fasta to be processed (suffix should be ".fa")
# locus is the target locus, must be one of: 12S, 16S, CYTB, COI. if you have more than one locus to analyse, run script once for each.
# protaxdir is the path to a directory containing protax models and clean databases for all 4 loci
# screenforbio is the path to the screenforbio-mbc directory (must contain subdirectory protaxscripts)
# outdir is the name to give an output directory (inside current)

##### WELCOME
if [ "$#" -ne 5 ]
then
	echo ""
	echo "You are trying to use protax_classify_otus.sh but have not provided enough information."
	echo "Please check the following:"
	echo ""
	echo "usage: bash protax_classify_otus.sh otus locus protaxdir screenforbio utdir"
	echo "where:"
	echo "otus is the (path to) the OTU fasta to be processed (suffix should be ".fa")"
  echo "locus is the target locus, must be one of: 12S, 16S, CYTB, COI. if you have more than one locus to analyse, run script once for each."
  echo "protaxdir is the path to a directory containing protax models and clean databases for all 4 loci (one subdirectory per locus) plus required scripts"
	echo "screenforbio is the path to the screenforbio-mbc directory (must contain subdirectory protaxscripts)"
	echo "outdir is the name to give an output directory (inside current)"
	echo ""
	exit 1
else
	echo "Welcome to protax_classify_otus.sh: classification of OTUs with an unweighted PROTAX model"
	echo ""
	echo "The time now is $(date)."
	echo "Information from you:"
	echo "FASTA file of OTUs is ${1}"
  echo "Protax models and reference databases are in ${3}"
	echo "PROTAX Perl scripts are in ${4}/protaxscripts"
	echo "Output directory will be ${5}"
	echo ""
fi

##### PARAMETERS
DATA=${1}
LOCUS=${2}
MODELS=${3}
SCRIPTS=${4}
OUTDIR=${4}
DATE=`date +%Y-%m-%d`

##### MAIN
echo "Step 1: Get names of input file"
echo ""
echo "File names is:"
file_name=($(basename ${DATA} ".fa"))
printf '%s\n' "${file_name[@]}"
echo ""

echo "Step 2: Make output directory"
echo ""
mkdir ./${OUTDIR}_${LOCUS}
echo "Output will be in: ./${OUTDIR}_${LOCUS}"
echo ""

echo "Step 3: Run protax"
echo ""
start=`date +%s`
#run LAST search
echo "	Running LAST search..."
lastal -T 1 -a 1 -f 0 -m 1000 ${MODELS}/model_${LOCUS}/lastref_${LOCUS} ${DATA} > ./${OUTDIR}_${LOCUS}/${file_name}.last
#convert LAST result to similarity
echo "	Convert LAST result to sequence similarity..."
perl ${SCRIPTS}/protaxscripts/last2sim.pl ./${OUTDIR}_${LOCUS}/${file_name}.last > ./${OUTDIR}_${LOCUS}/${file_name}.lastsim
#clean up
rm ./${OUTDIR}_${LOCUS}/${file_name}.last
#get IDs
echo "	Get sequence IDs..."
grep '^>' ${DATA} | cut -c2- > ./${OUTDIR}_${LOCUS}/${file_name}.ids
#generate base log probability file
echo "	Generate base log probability file..."
perl ${SCRIPTS}/protaxscripts/testsample2init.pl ./${OUTDIR}_${LOCUS}/${file_name}.ids > ./${OUTDIR}_${LOCUS}/${file_name}.0.logprob
#classify sequences at each taxonomic level (order, family, genus, species)
echo "	Classifying at..."
for LEVEL in 1 2 3 4
do
  echo "		LEVEL $LEVEL"
  PREVLEVEL=$((LEVEL-1))
  IFILE=./${OUTDIR}_${LOCUS}/${file_name}.${PREVLEVEL}.logprob
  OFILE=./${OUTDIR}_${LOCUS}/${file_name}.${LEVEL}.logprob
  perl ${SCRIPTS}/protaxscripts/classify4.pl $IFILE ${MODELS}/model_${LOCUS}/tax$LEVEL ${MODELS}/model_${LOCUS}/ref.tax$LEVEL ${MODELS}/model_${LOCUS}/rseqs$LEVEL ${MODELS}/model_${LOCUS}/mcmc$LEVEL map ./${OUTDIR}_${LOCUS}/${file_name}.lastsim 0 .05 $OFILE 1
done
#add taxonomic info
echo "	Adding taxonomy..."
perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${file_name}.0.logprob > ./${OUTDIR}_${LOCUS}/${file_name}.class_probs
perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${file_name}.1.logprob > ./${OUTDIR}_${LOCUS}/${file_name}.order_probs
perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${file_name}.2.logprob > ./${OUTDIR}_${LOCUS}/${file_name}.family_probs
perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${file_name}.3.logprob > ./${OUTDIR}_${LOCUS}/${file_name}.genus_probs
perl ${SCRIPTS}/protaxscripts/add_taxonomy_info.pl ${MODELS}/model_${LOCUS}/taxonomy ./${OUTDIR}_${LOCUS}/${file_name}.4.logprob > ./${OUTDIR}_${LOCUS}/${file_name}.species_probs
#add sequence similarity of assigned species/genus (for unassigned OTUs takes best matching sequence)
for id in $(cut -f1 ./${OUTDIR}_${LOCUS}/${file_name}.species_probs | sort -u)
do
	sp=$(grep -w ${id} ./${OUTDIR}_${LOCUS}/${file_name}.species_probs | cut -f5 | awk 'BEGIN{FS=","}{print $3 "_" $4}')
	if grep -q "_unk$" <(echo ${sp})
	then
		gen=$(sed 's/_unk//' <(echo ${sp}))
		#find gen
		if grep -q -w ${id} ./${OUTDIR}_${LOCUS}/${file_name}.lastsim
		then
			grep -w ${id} ./${OUTDIR}_${LOCUS}/${file_name}.lastsim | grep ${gen} | awk -v max=0 '{if($3>max){want=$0;max=$3}}END{print want}' >> ./${OUTDIR}_${LOCUS}/${file_name}.bestsim
		else
			continue
		fi
	else
		#find sp (for unassigned OTUs will pick up best matching sequence)
		if grep -q -w ${id} ./${OUTDIR}_${LOCUS}/${file_name}.lastsim
		then
			grep -w ${id} ./${OUTDIR}_${LOCUS}/${file_name}.lastsim | grep ${sp} | awk -v max=0 '{if($3>max){want=$0;max=$3}}END{print want}' >> ./${OUTDIR}_${LOCUS}/${file_name}.bestsim
		else
			continue
		fi
	fi
done
join -j 1 -a 1 -o 1.1,1.2,1.3,1.4,1.5,2.3,2.2 <(sort -k1,1 ./${OUTDIR}_${LOCUS}/${file_name}.species_probs) <(sort -k1,1 ./${OUTDIR}_${LOCUS}/${file_name}.bestsim) > ./${OUTDIR}_${LOCUS}/${file_name}.species_probs_sim
echo ""

##### END
echo "protax_classify_otus.sh complete."
end=`date +%s`
runtime=$((end-start))
echo "This took a total of `echo $runtime | awk '{printf "%.2f", $1/60}'` minutes."
echo ""
echo "Results are in ./${OUTDIR}_${LOCUS}"
echo "Classification for each OTU at each taxonomic level (species, genus, family, order) in files ${file_name}.level_probs"
echo "Headers are:"
echo "	queryID	taxID	log(probability)	level	taxon"
echo "Additionally, the best matching hit (for assigned species/genus where available) found with LAST is appended to ${file_name}.species_probs in ${file_name}.species_probs_sim"
echo "Headers are:"
echo "	queryID	taxID	log(probability)	level	taxon	bestHit_similarity	bestHit"
echo ""
echo "Have a nice day :-)"
echo ""