-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAnvio_Workflow
240 lines (209 loc) · 15.6 KB
/
Anvio_Workflow
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
When this was started Anvi'o was at version 3. It was rerun with version 5.5 to confirm it works.
#-------------------------------------------------Creating database (anvio now V5.5)-----------------------------------------------------------------------------#
cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.1/
#by using the cut off of 1,000 instead of 300bp we end up with ~800k Rather than 5-6million contigs when I used 300bp
#using a cut off of 2,500 you get 124k contigs and a cut of of 5,000 gives you 24k contigs
time anvi-script-reformat-fasta -o fixed.contigsV5_1000.fa -l 1000 --simplify-names --report-file contigs_1000.reformatA.txt ../../MEGAHIT/C5_Results/final.contigs.fa
#echo "Done Reformating Fasta 1000"
cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/
#Building database
anvi-gen-contigs-database -f ../C5_V5.1/fixed.contigs.fa -o fixed.contigsV5.5.db -n 'CDRF_Metagenome'
#Checking Stats
anvi-display-contigs-stats fixed.contigsV5.5.db --report-as-text --output contig_stats.txt
#----------------------------------------------Functional Annotation with Pfam Hmms-----------------------------------------------------------------------------#
#!/bin/bash
##
#SBATCH --mem=50G
#SBATCH --time=7-18:0:0
#SBATCH -p production
#SBATCH -n 20
#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/C5_PfamsV5.5_1k_running.out
#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/C5_PfamsV5.5_1k_running.err
source activate Anvio5.5
#Annotating genes in contigs database with functions from the Pfams.
cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/
#This continues to give errors when downloading
#time anvi-setup-pfams --reset --pfam-data-dir /share/magalab/bin/anaconda3/envs/Anvio5.5/lib/python3.6/site-packages/anvio/data/misc/Pfams/
#echo "done setting up Pfams directory"
time anvi-run-pfams -c fixed.contigsV5.5.db -T 20 --pfam-data-dir /share/magalab/bin/anaconda3/envs/Anvio5.5/lib/python3.6/site-packages/anvio/data/misc/Pfams/
echo "done running up Pfams"
#----------------------------------------------Functional Annotation with custom sets of Hmms-----------------------------------------------------------------------------#
#-----------------------------Nitrogen Fixation Gene Hmms-----------------#
#!/bin/bash
##
#SBATCH --mem=80G
#SBATCH --time=12-18:0:0
#SBATCH -p production
#SBATCH -n 20
#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/NF_HmmV5.5.out
#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/NF_HmmV5.5.err
source activate Anvio5.5
#needs more mem than 50G
cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/
time anvi-run-hmms -T 20 -c fixed.contigsV5.5.db --hmm-profile-dir ../Nif_Hmms/
echo "Done HMMER Nitrogen"
#-----------------------------Nitrogen Cycle Gene Hmms-----------------#
#!/bin/bash
##
#SBATCH --mem=80G
#SBATCH --time=12-18:0:0
#SBATCH -p production
#SBATCH -n 20
#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/Nitro_HmmV5.5.out
#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/Nitro_HmmV5.5.err
source activate Anvio5.5
#needs more mem than 50G
cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/
time anvi-run-hmms -T 20 -c fixed.contigsV5.5.db --hmm-profile-dir ../Nitro_Hmms/
echo "Done HMMER Nitrogen"
#-----------------------------FOAM Nitrogen Cycle Hmms-----------------#
##Running hmm from Foam collection
#Tried this and this ran for 30days without completion ... decided to subset out the nitrogen related genes and run that
anvi-run-hmms -T 70 -c /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V4/fixed.contigsV4.db --hmm-profile-dir /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/FOAM_2018/
#Will need to run Foam_to_anvio.py to get files formated properly --> see FOAM_2018 readme.md for formating directions
##Running Nitrogen cycle hmm from subset of Foam collection
#!/bin/bash
##
#SBATCH --mem=80G
#SBATCH --time=30-18:0:0
#SBATCH -p production
#SBATCH -n 40
#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/Foam_Nitro_HmmV5.5.out
#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/Foam_Nitro_HmmV5.5.err
source activate Anvio5.5
#needs more mem than 50G
cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/
time anvi-run-hmms -T 40 -c fixed.contigsV5.5.db --hmm-profile-dir /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/FOAM_2018/Foam_Nitro/
echo "Done HMMER Foam"
#-------------------------------------------Functional Annotation with NCBI Cogs-----------------------------------------------------------------------------#
#!/bin/bash
##
#SBATCH --mem=50G
#SBATCH --time=7-18:0:0
#SBATCH -p production
#SBATCH -n 20
#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/C5_CogsV5.5_1k_running.out
#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/C5_CogsV5.5_1k_running.err
source activate Anvio5.5
#Annotating genes in contigs database with functions from the NCBI’s Clusters of Orthologus Groups.
cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/
time anvi-setup-ncbi-cogs -T 20 --just-do-it --reset --cog-data-dir /share/magalab/bin/anaconda3/envs/Anvio5.5/lib/python3.6/site-packages/anvio/data/misc/COG/
echo "done setting up cog directory"
time anvi-run-ncbi-cogs -c fixed.contigsV5.5.db -T 20 --sensitive --cog-data-dir /share/magalab/bin/anaconda3/envs/Anvio5.5/lib/python3.6/site-packages/anvio/data/misc/COG/
echo "done running up cogs"
#----------------------------------------------Functional Annotation with Prokka-----------------------------------------------------------------------------#
#I did try using prokka for gene calls and then bringing those gene calls into the anvio database to beginning with, but got a lot
#less gene calls, see note below. In the end, I abandoned this method.
#!/bin/bash
##
#SBATCH --mem=50G
#SBATCH --time=1-18:0:0
#SBATCH -n 5
#SBATCH -p production
#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.2/Error_Out_Files/C5_1k_prokka_DBV5.2_Anvio.out
#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.2/Error_Out_Files/C5_1k_prokka_DBV5.2_Anvio.err
aklog
source activate anvio5.2
time anvi-gen-contigs-database -f ../C5_V5.1/fixed.contigsV5_1000.fa -o fixed.contigsV5_1000_prokka.db --external-gene-calls /share/magalab/bin/Prokka_to_Anvi/gene_calls.txt --ignore-internal-stop-codons --project-name 'CDRF_Metagenome_2016'
#Then import the functional annotations:
time anvi-import-functions -c fixed.contigsV5_1000_prokka.db -i /share/magalab/bin/Prokka_to_Anvi/gene_annot.txt
##note:
#You might have noticed that Prokka finds a lot less genes in your metagenome than the standard anvi’o pipeline.
#This is because of the hard-coded Prodigal option -c in Prokka, which only calls full-length genes (it probably is there for a reason, so proceed with caution).
#But for metagenomes, you might except to have many partial gene calls, and might want to include them in your analyses.
#To achieve that, you can hack the Prokka script and remove the -c option from the Prodigal command.
#----------------------------------------Functional Annotation with GhostKOALA-----------------------------------------------------------------------------#
#----------------------------------------Functional Annotation with GhostKOALA-----------------------------------------------------------------------------#
See readme in folder on functional annotation --> GhostKOALA
#-------------------------------------------Making sam/bam files to make anvi profiles-----------------------------------------------------------------------#
#Bash script
for f in /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/*.output.pe.kak.qc.fq.gz
do
fname=$(basename $f .output.pe.kak.qc.fq.gz)
echo $fname
echo "#!/bin/bash" > $fname.C5.MB.Anvi.sh
echo "##" >> $fname.C5.MB.Anvi.sh
echo "#SBATCH --mem=10G" >> $fname.C5.MB.Anvi.sh
echo "#SBATCH --time=2-18:0:0" >> $fname.C5.MB.Anvi.sh
echo "#SBATCH -p gc,gc64,gc128,animal_sciences" >> $fname.C5.MB.Anvi.sh
echo "#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Error_Out_Files/${fname}_sam_making_bam.out" >> $fname.C5.MB.Anvi.sh
echo "#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Error_Out_Files/${fname}_sam_making_bam.err" >> $fname.C5.MB.Anvi.sh
echo "module load bowtie2/2.2.8" >> $fname.C5.MB.Anvi.sh
echo "module load samtools/1.8" >> $fname.C5.MB.Anvi.sh
echo "module load khmer/2.0-rc1" >> $fname.C5.MB.Anvi.sh
echo "#time split-paired-reads.py -f -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/Split/ -1 /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/Split/${fname}.output.1.pe.kak.qc.fq.gz -2 /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/Split/${fname}.output.2.pe.kak.qc.fq.gz /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/${fname}.output.pe.kak.qc.fq.gz" >> $fname.C5.MB.Anvi.sh
echo "#echo "done split-paired-reads"" >> $fname.C5.MB.Anvi.sh
echo "#echo "Note that running split-paired-reads.py will unzip file. Bowtie2 will error out if .gz is end of name without file being zipped."" >> $fname.C5.MB.Anvi.sh
echo "#time mv /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/Split/${fname}.output.1.pe.kak.qc.fq.gz /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/Split/${fname}.output.1.pe.kak.qc.fastq" >> $fname.C5.MB.Anvi.sh
echo "#time mv /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/Split/${fname}.output.2.pe.kak.qc.fq.gz /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/Split/${fname}.output.2.pe.kak.qc.fastq" >> $fname.C5.MB.Anvi.sh
echo "#echo "Done renaming file"" >> $fname.C5.MB.Anvi.sh
echo "#time bowtie2 --threads 8 -x /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/fixed.contigs -1 /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/Split/${fname}.output.1.pe.kak.qc.fastq -2 /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/Split/${fname}.output.2.pe.kak.qc.fastq -U /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Normalization/${fname}.output.se.kak.qc.fq.gz -S /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/${fname}.sam" >> $fname.C5.MB.Anvi.sh
echo "#echo "done bowtie2 both"" >> $fname.C5.MB.Anvi.sh
echo "time samtools view -F 4 -bS /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/${fname}.sam > /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/${fname}-RAW.bam" >> $fname.C5.MB.Anvi.sh
echo "echo "done samtools view both"" >> $fname.C5.MB.Anvi.sh
echo "time samtools sort -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/${fname}.sorted.bam /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/${fname}-RAW.bam" >> $fname.C5.MB.Anvi.sh
echo "echo "done samtools sort both"" >> $fname.C5.MB.Anvi.sh
sbatch $fname.C5.MB.Anvi.sh
done
###For whatever reason Anvio didn't like how this was sorted so I resorted them via anvio.
#-------------------------------------------Sorting Bam files with Anvio-----------------------------------------------------------------------#
for f in /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/*-RAW.bam
do
fname=$(basename $f -RAW.bam)
echo $fname
echo "#!/bin/bash" > $fname.AnviBam.sh
echo "##" >> $fname.AnviBam.sh
echo "#SBATCH -p production" >> $fname.AnviBam.sh
echo "#SBATCH --mem=10G" >> $fname.AnviBam.sh
echo "#SBATCH --time=2-18:0:0" >> $fname.AnviBam.sh
echo "#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/Error_Out_Files/${fname}_Bam.out" >> $fname.AnviBam.sh
echo "#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/Error_Out_Files/${fname}_Bam.err" >> $fname.AnviBam.sh
echo "cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_Mapping/" >> $fname.AnviBam.sh
echo "time "anvi-init-bam ${fname}-RAW.bam -o ${fname}.bam >> $fname.AnviBam.sh
sbatch $fname.AnviBam.sh
done
#-------------------------------------------Making Anvio Profiles for Each Sample-----------------------------------------------------------------------#
#NOTE: anivo doesn't like sample names to start with numbers so I renamed the samples
#rename 's/(\d+)-(\d+)_S(\d+)/Farm$1-S$2/' *.bam , rename 's/(\d+)-(\d+)_S(\d+)/Farm$1-S$2/' *.bam.bai
#These took 1-3 days to make
for f in /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.1/1000_Mapping/Renamed/*.bam
do
fname=$(basename $f .bam)
echo $fname
echo "#!/bin/bash" > $fname.anvi_profile.sh
echo "##" >> $fname.anvi_profile.sh
echo "#SBATCH --mem=80G" >> $fname.anvi_profile.sh
echo "#SBATCH --time=3-18:0:0" >> $fname.anvi_profile.sh
echo "#SBATCH -n 10" >> $fname.anvi_profile.sh
echo "#SBATCH -p production" >> $fname.anvi_profile.sh
echo "#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/${fname}_anvi_profile.out" >> $fname.anvi_profile.sh
echo "#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/${fname}_anvi_profile.err" >> $fname.anvi_profile.sh
echo "aklog" >> $fname.anvi_profile.sh
echo "source activate Anvio5.5" >> $fname.anvi_profile.sh
echo "cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.1/1000_Mapping/Renamed/" >> $fname.anvi_profile.sh
echo "time anvi-profile -T 20 -i ./Renamed/${fname}.bam -c ../../C5_V5.5/fixed.contigsV5.5db --list-contigs" >> $fname.anvi_profile.sh
echo "echo "Done listing contigs"" >> $fname.anvi_profile.sh
echo "time anvi-profile -T 20 --min-contig-length 1000 --min-coverage-for-variability 5 -i ${fname}.bam -c ../../C5_V5.5/fixed.contigsV5.5.db -o /tmp/Anvio_Profile_Temp.btGg1/${fname}/ --sample-name ${fname} --overwrite-output-destinations" >> $fname.anvi_profile.sh
echo "echo "done profiles sort both"" >> $fname.anvi_profile.sh
echo "time cp -r /tmp/Anvio_Profile_Temp.btGg1/${fname}/ ../../C5_V5.5/Profiles/" >> $fname.anvi_profile.sh
sbatch $fname.anvi_profile.sh
done
#-------------------------------------------Merging Anvio Profiles for Each Sample-----------------------------------------------------------------------#
#!/bin/bash
##
#SBATCH --mem=50G
#SBATCH --time=20-18:0:0
#SBATCH -n 5
#SBATCH -p production
#SBATCH -o /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/Merge_all_Anvio.out
#SBATCH -e /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Error_Out_Files/Merge_all_Anvio.err
aklog
source activate Anvio5.5
mkdir /tmp/Anvio_Profile_Temp.btGg1/
cd /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/
cp -r ./Profiles/ /tmp/Anvio_Profile_Temp.btGg1/
cd /tmp/Anvio_Profile_Temp.btGg1/Profiles/
time anvi-merge ./Farm6_S7/PROFILE.db ./Farm6_S5/PROFILE.db ./Farm6_S1/PROFILE.db ./Farm5_S11/PROFILE.db ./Farm5_S6/PROFILE.db ./Farm5_S14/PROFILE.db ./Farm1_S1/PROFILE.db ./Farm1_S10/PROFILE.db ./Farm1_S12/PROFILE.db ./Farm8_S10/PROFILE.db ./Farm8_S9/PROFILE.db ./Farm8_S8/PROFILE.db -o ./All-MERGED/ -c /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/fixed.contigsV5.5.db --overwrite-output-destinations
echo "Done merging all farms"
#on rafter-0 and rafter-16
cp -r /tmp/Anvio_Profile_Temp.btGg1/Profiles/All-MERGED/ /share/tearlab/Maga/Jill/CDRF_MetaGenome/Assembly_2018/Anvio/C5_V5.5/Profiles/