forked from google-deepmind/alphafold
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.sh
223 lines (177 loc) · 8.26 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/bin/bash
# Description: AlphaFold non-docker version
# Author: Sanjay Kumar Srikakulam
# by WTTAT
#######################################
# Print the command-line help text and terminate the script.
# Arguments: none
# Outputs:   the help text on stdout
# Returns:   does not return; exits the current shell with status 1
#######################################
usage() {
  echo ""
  echo "Please make sure all required parameters are given"
  echo "Usage: $0 <OPTIONS>"
  echo "Required Parameters:"
  echo "-f <fasta_paths> Path to a FASTA file containing sequence. If a FASTA file contains multiple sequences, then it will be folded as a multimer"
  echo "-t <max_template_date> Maximum template release date to consider (ISO-8601 format - i.e. YYYY-MM-DD). Important if folding historical test sets"
  echo "Optional Parameters:"
  echo "-m <model_preset> Choose preset model configuration - the monomer model, the monomer model with extra ensembling, monomer model with pTM head, or multimer model (default: 'monomer')"
  echo "-c <db_preset> Choose preset MSA database configuration - smaller genetic database config (reduced_dbs) or full genetic database config (full_dbs) (default: 'full_dbs')"
  echo "-l <num_multimer_predictions_per_model> How many predictions (each with a different random seed) will be generated per model. E.g. if this is 2 and there are 5 models then there will be 10 predictions per input. Note: this FLAG only applies if model_preset=multimer (default: 5)"
  echo "-p <use_precomputed_msas> Whether to read MSAs that have been written to disk. WARNING: This will not check if the sequence, database or configuration have changed (default: 'false')"
  # The previous text on this line described --use_precomputed_msas; -r selects
  # which models are relaxed and accepts exactly the values validated later in
  # this script.
  echo "-r <models_to_relax> Which models to run the final relaxation step on: 'all', 'best' or 'none' (default: 'best')"
  echo "-b <benchmark> Run multiple JAX model evaluations to obtain a timing that excludes the compilation time, which should be more indicative of the time required for inferencing many proteins (default: 'false')"
  exit 1
}
#######################################
# Parse the command-line flags into the option variables used by the rest of
# the script.
# Globals written: fasta_paths, max_template_date, model_preset, db_preset,
#                  num_multimer_predictions_per_model, use_precomputed_msas,
#                  models_to_relax, benchmark
# Arguments: the script's command-line arguments
# Outputs:   a warning on stderr for an unknown flag or a flag that is
#            missing its argument (parsing then continues)
# Returns:   0
#######################################
parse_options() {
  local opt
  # A local OPTIND restarts getopts at the first argument on every call.
  local OPTIND=1
  # The leading ':' selects silent error reporting, so bad flags are reported
  # by the ':' and '?' arms below rather than by getopts itself.
  while getopts ":f:t:m:c:l:p:r:b" opt; do
    case "${opt}" in
      f)
        fasta_paths=$OPTARG
        ;;
      t)
        max_template_date=$OPTARG
        ;;
      m)
        model_preset=$OPTARG
        ;;
      c)
        db_preset=$OPTARG
        ;;
      l)
        num_multimer_predictions_per_model=$OPTARG
        ;;
      p)
        use_precomputed_msas=$OPTARG
        ;;
      r)
        models_to_relax=$OPTARG
        ;;
      b)
        # -b takes no argument; giving it turns benchmarking on.
        benchmark='true'
        ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        ;;
      \?)
        echo "Ignoring unknown option -$OPTARG" >&2
        ;;
    esac
  done
  return 0
}

# Benchmarking is off unless -b is given; without this default the variable
# stays empty and an empty --benchmark= value is passed on.
benchmark=${benchmark:-false}
parse_options "$@"
# Log the job settings exactly as received, before any defaults are applied.
# BATCH_BUCKET, REGION and fasta are not assigned anywhere in this script;
# presumably they are supplied through the environment.
printf '%s\n' \
  "BATCH_BUCKET : $BATCH_BUCKET" \
  "REGION : $REGION" \
  "fasta : $fasta" \
  "fasta_paths : $fasta_paths" \
  "max_template_date : $max_template_date" \
  "model_preset : $model_preset" \
  "db_preset : $db_preset" \
  "num_multimer_predictions_per_model : $num_multimer_predictions_per_model" \
  "use_precomputed_msas: $use_precomputed_msas" \
  "models_to_relax: $models_to_relax"
pwd

# -f and -t are mandatory: print the help text (which exits) when either one
# is missing or empty.
if [[ -z "$fasta_paths" || -z "$max_template_date" ]]; then
  usage
fi
#######################################
# Fill in defaults for optional settings that were not supplied, replace
# unrecognised preset values with their defaults, and log the result.
# Globals (read and written): num_multimer_predictions_per_model,
#   model_preset, db_preset, use_precomputed_msas, models_to_relax
# Arguments: none
# Outputs:   an "Unknown ..." notice on stdout for each rejected value,
#            followed by the three "... reset:" lines
# Returns:   0
#######################################
apply_option_defaults() {
  # ${var:=value} assigns only when the variable is unset or empty.
  : "${num_multimer_predictions_per_model:=5}"
  : "${model_preset:=monomer}"
  : "${db_preset:=full_dbs}"
  : "${use_precomputed_msas:=false}"
  # The original test compared the literal word "models_to_relax" with "",
  # which is never true, so an omitted -r was reported below as an unknown
  # value instead of quietly defaulting.
  : "${models_to_relax:=best}"

  case "$model_preset" in
    monomer | monomer_casp14 | monomer_ptm | multimer) ;;
    *)
      echo "Unknown model preset! Using default ('monomer')"
      model_preset="monomer"
      ;;
  esac

  case "$db_preset" in
    full_dbs | reduced_dbs) ;;
    *)
      echo "Unknown database preset! Using default ('full_dbs')"
      db_preset="full_dbs"
      ;;
  esac

  case "$models_to_relax" in
    best | all | none) ;;
    *)
      echo "Unknown models_to_relax! Using default ('best')"
      models_to_relax="best"
      ;;
  esac

  echo "model_preset reset: $model_preset"
  echo "db_preset reset: $db_preset"
  echo "num_multimer_predictions_per_model reset: $num_multimer_predictions_per_model"
  return 0
}

apply_option_defaults
# run_alphafold.py is expected in the directory this script is started from.
# The existence check below is disabled, so a missing script is not detected
# at this point.
current_working_dir="${PWD}"
alphafold_script="${current_working_dir}/run_alphafold.py"
# if [ ! -f "$alphafold_script" ]; then
#   echo "Alphafold python script $alphafold_script does not exist."
#   exit 1
# fi

# Export ENVIRONMENT variables and set CUDA devices for use
# (all of the following is disabled here; see "This part set in batch env")
# CUDA GPU control
# export CUDA_VISIBLE_DEVICES=-1
# if [[ "$use_gpu" == true ]] ; then
#   export CUDA_VISIBLE_DEVICES=0
#   if [[ "$gpu_devices" ]] ; then
#     export CUDA_VISIBLE_DEVICES=$gpu_devices
#   fi
# fi
# OpenMM threads control
# if [[ "$openmm_threads" ]] ; then
#   export OPENMM_CPU_THREADS=$openmm_threads
# fi
# This part set in batch env
# # TensorFlow control
# export TF_FORCE_UNIFIED_MEMORY='1'
# # JAX control
# export XLA_PYTHON_CLIENT_MEM_FRACTION='4.0'

# Root of the dataset tree (FSx for Lustre mount).
data_dir="/fsx/dataset"

# Database locations below the dataset root (change me if required).
uniref90_database_path="${data_dir}/uniref90/uniref90.fasta"
uniprot_database_path="${data_dir}/uniprot/uniprot.fasta"
mgnify_database_path="${data_dir}/mgnify/mgy_clusters_2022_05.fa"
bfd_database_path="${data_dir}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt"
small_bfd_database_path="${data_dir}/small_bfd/bfd-first_non_consensus_sequences.fasta"
uniref30_database_path="${data_dir}/uniref30/UniRef30_2021_03"
pdb70_database_path="${data_dir}/pdb70/pdb70"
pdb_seqres_database_path="${data_dir}/pdb_seqres/pdb_seqres.txt"
template_mmcif_dir="${data_dir}/pdb_mmcif/mmcif_files"
obsolete_pdbs_path="${data_dir}/pdb_mmcif/obsolete.dat"
# Binary path (change me if required)
#######################################
# Print the path of an executable found on PATH.
# Uses the shell's own `command -v` lookup, so no external `which` program is
# needed.
# Arguments: $1 - name of the tool to look up
# Outputs:   the resolved path on stdout; a warning on stderr when not found
# Returns:   0 when found, 1 otherwise
#######################################
locate_binary() {
  local tool_path
  if ! tool_path=$(command -v "$1"); then
    echo "WARNING: '$1' was not found on PATH" >&2
    return 1
  fi
  printf '%s\n' "$tool_path"
}

# A tool that cannot be found leaves its variable empty and the script
# continues, as it did before.
hhblits_binary_path=$(locate_binary hhblits)
hhsearch_binary_path=$(locate_binary hhsearch)
jackhmmer_binary_path=$(locate_binary jackhmmer)
kalign_binary_path=$(locate_binary kalign)
# download fasta file from S3
# INPUT_S3_URI and REGION are not assigned in this script; presumably they are
# provided by the job environment.
echo "start downloading"
if ! aws s3 cp "$INPUT_S3_URI" ./input.fasta --region "$REGION"; then
  # Stop here instead of starting a prediction without the requested input.
  echo "failed to download $INPUT_S3_URI" >&2
  exit 1
fi

output_dir="/app/output/"
use_gpu_relax='true'

# The argument lists are arrays so that every flag reaches python as exactly
# one word, even when a value contains whitespace or glob characters.
command_args=(
  "--fasta_paths=$fasta_paths"
  "--output_dir=$output_dir"
  "--max_template_date=$max_template_date"
  "--db_preset=$db_preset"
  "--model_preset=$model_preset"
  # Never pass an empty value for this flag when -b was not given.
  "--benchmark=${benchmark:-false}"
  "--use_precomputed_msas=$use_precomputed_msas"
  "--num_multimer_predictions_per_model=$num_multimer_predictions_per_model"
  "--use_gpu_relax=$use_gpu_relax"
  "--models_to_relax=$models_to_relax"
  "--logtostderr"
)
database_paths=(
  "--uniref90_database_path=$uniref90_database_path"
  "--mgnify_database_path=$mgnify_database_path"
  "--data_dir=$data_dir"
  "--template_mmcif_dir=$template_mmcif_dir"
  "--obsolete_pdbs_path=$obsolete_pdbs_path"
)
binary_paths=(
  "--hhblits_binary_path=$hhblits_binary_path"
  "--hhsearch_binary_path=$hhsearch_binary_path"
  "--jackhmmer_binary_path=$jackhmmer_binary_path"
  "--kalign_binary_path=$kalign_binary_path"
)

# The multimer preset gets the uniprot and pdb_seqres databases; every other
# preset gets pdb70.
if [[ "$model_preset" == "multimer" ]]; then
  database_paths+=(
    "--uniprot_database_path=$uniprot_database_path"
    "--pdb_seqres_database_path=$pdb_seqres_database_path"
  )
else
  database_paths+=("--pdb70_database_path=$pdb70_database_path")
fi

# reduced_dbs uses small_bfd; otherwise uniref30 and the full bfd are passed.
if [[ "$db_preset" == "reduced_dbs" ]]; then
  database_paths+=("--small_bfd_database_path=$small_bfd_database_path")
else
  database_paths+=(
    "--uniref30_database_path=$uniref30_database_path"
    "--bfd_database_path=$bfd_database_path"
  )
fi

echo "command_args: ${command_args[*]}"
echo "database_paths: ${database_paths[*]}"
echo "binary_paths: ${binary_paths[*]}"

# Run AlphaFold with required parameters
echo "start running af2"
# Invoke python directly. Wrapping the call in $( ... ) as a statement made
# the shell execute whatever python printed on stdout as a second command.
python "$alphafold_script" "${binary_paths[@]}" "${database_paths[@]}" "${command_args[@]}"
error_code=$?
echo "$error_code"
if (( error_code != 0 )); then
  echo "af2 failed"
  exit "$error_code"
else
  echo "af2 succeed"
fi
echo "start ziping"
# Name of the result directory inside $output_dir: the FASTA file name without
# its directory part and without its last extension. Stripping only the
# extension from the full -f value kept any directory components, which gave a
# wrong or even empty directory name when -f carried a path.
# NOTE(review): assumes AlphaFold writes results to <output_dir>/<fasta stem> — confirm.
fasta_file=${fasta_paths##*/}
result_folder=${fasta_file%.*}
if [[ -z "$result_folder" ]]; then
  echo "cannot derive the result folder name from '$fasta_paths'" >&2
  exit 1
fi

# Every step below aborts on failure so the job never reports "all done"
# without an uploaded archive.
cd "$output_dir" || {
  echo "cannot enter output directory $output_dir" >&2
  exit 1
}
if ! tar -zcvf output.tar.gz "$result_folder/"; then
  echo "failed to archive $result_folder" >&2
  exit 1
fi

echo "start uploading"
# aws s3 sync $output_dir/$result_folder s3://$BATCH_BUCKET/$OUTPUT_PREFIX/$fasta --region $REGION
# add metadata
# The metadata is the same JSON object as before, {"id":"<file_id>"}, now
# passed as one quoted word so the value of file_id cannot be split.
if ! aws s3 cp "${output_dir%/}/output.tar.gz" "$OUTPUT_S3_URI" \
  --metadata "{\"id\":\"${file_id}\"}" --region "$REGION"; then
  echo "failed to upload output.tar.gz to $OUTPUT_S3_URI" >&2
  exit 1
fi
echo "all done"