Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test modification of intermediate structural representations #3

Draft
wants to merge 14 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/codespell.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ jobs:
- name: Codespell
uses: codespell-project/actions-codespell@v2
with:
skip: ./.git,*.pdb,./tests/data
skip: ./.git,*.pdb,./tests/data,*.ipynb
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ __pycache__
*.egg-info/
*.swp
.DS_Store

data/*
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ dynamic = ["version"]
[project.scripts]
calc-pr = "metfish.commands:get_Pr_cli"
extract-seq = "metfish.commands:extract_seq_cli"
prep-conformer-pairs = "metfish.commands:prep_conformer_pairs_cli"

[tool.ruff]
# Exclude a variety of commonly ignored directories.
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ pandas
numpy
scipy
periodictable
seaborn
matplotlib
alphafold-colabfold
1 change: 0 additions & 1 deletion scripts/PDB70_NERSC/PDBtoSeq.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import argparse
import os

def extract_seq(pdb_input,output_path):
pdb_name=pdb_input.split(".")[0]
Expand Down
1,165 changes: 1,165 additions & 0 deletions scripts/plot_conformer_pair_outcomes.ipynb

Large diffs are not rendered by default.

28 changes: 25 additions & 3 deletions src/metfish/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import pandas as pd

from .utils import get_Pr
from .utils import extract_seq
from .utils import get_Pr, extract_seq
from .preprocess import prep_conformer_pairs

def get_Pr_cli(argv=None):

Expand Down Expand Up @@ -41,7 +41,7 @@ def get_Pr_cli(argv=None):
step=args.step)

pd.DataFrame({"r": r, "P(r)": p}).to_csv(out, index=False)

def extract_seq_cli(argv=None):

parser = argparse.ArgumentParser(
Expand All @@ -54,3 +54,25 @@ def extract_seq_cli(argv=None):
args = parser.parse_args()

extract_seq(args.filename,args.output)

def prep_conformer_pairs_cli(argv=None):
    """CLI entry point for preprocessing conformer pair pdb files.

    Args:
        argv (list[str], optional): argument list to parse; defaults to sys.argv
            when None (the normal console-script path).
    """
    desc = "Preprocess conformer pair pdb files to generate fasta sequences, atom positions in AF structure format"
    epi = """Requires folder with the csv of apo/holo pairs and pdbs from Saldano et al., 2022"""

    parser = argparse.ArgumentParser(description=desc, epilog=epi)
    parser.add_argument("data_dir", help="the directory containing the pdb files / csv")
    parser.add_argument("-o", "--output_dir", type=str, default=None,
                        help="the directory to save output files to. Defaults to data_dir if not provided")
    parser.add_argument("-n", "--n_pairs", type=int, default=6,
                        help="the # of pairs to look at, N pairs -> 2N structures predicted")
    parser.add_argument("-a", "--af_output_dir", type=str, default=None,
                        help="path with alphafold outputs, used to calculate which conformer is less similar to the original AF prediction.",
                        )

    # fix: pass argv through so programmatic callers (and tests) can supply arguments;
    # parse_args(None) still falls back to sys.argv, so console-script behavior is unchanged
    args = parser.parse_args(argv)

    prep_conformer_pairs(args.data_dir,
                         output_dir=args.output_dir,
                         n_pairs=args.n_pairs,
                         af_output_dir=args.af_output_dir,)
75 changes: 75 additions & 0 deletions src/metfish/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import pandas as pd
import pickle
import glob
import shutil

from metfish.utils import get_alphafold_atom_positions, get_rmsd, convert_pdb_to_sequence, save_clean_pdb
from pathlib import Path


def prep_conformer_pairs(data_dir, output_dir=None, n_pairs=6, af_output_dir=None):
    """Preprocess pdb files to generate fasta sequences, atom positions in AF structure format.

    Requires folder with the csv of apo/holo pairs and pdbs from Saldano et al., 2022

    Args:
        data_dir (str): the directory containing the pdb files and apo_holo_pairs.csv
        output_dir (str, optional): the directory to save output files to. Defaults to data_dir if not provided
        n_pairs (int, optional): the # of pairs to look at, N pairs -> 2N structures predicted. Defaults to 6.
        af_output_dir (str, optional): path with alphafold outputs,
            used to calculate which conformer is less similar to the original AF prediction. Defaults to None.

    Raises:
        FileNotFoundError: if af_output_dir is given but no AF prediction exists for an apo id.
    """
    output_dir = output_dir or data_dir

    # load apo and holo id names; fixed seed so repeated runs subsample the same pairs
    pairs_df = pd.read_csv(f"{data_dir}/apo_holo_pairs.csv")
    if n_pairs < pairs_df.shape[0]:
        pairs_df = pairs_df.sample(n_pairs, random_state=1234)

    # convert pdb files into sequences, AF structures to use as AF inputs
    holo_id = pairs_df["holo_id"].to_list()
    apo_id = pairs_df["apo_id"].to_list()
    for name in [*holo_id, *apo_id]:
        _prep_structure_inputs(name, data_dir, output_dir, is_apo=name in apo_id)

    # calculate RMSD values between alphafold output and apo / holo conformers
    if af_output_dir is not None:
        _append_af_similarity(pairs_df, apo_id, holo_id, output_dir, af_output_dir)
        pairs_df.to_csv(f"{output_dir}/apo_holo_pairs_with_similarity.csv", index=False)


def _prep_structure_inputs(name, data_dir, output_dir, is_apo):
    """Clean one pdb, write its fasta sequence, and pickle its AF-format atom positions."""
    # clean pdbs (extract only ATOM coordinates)
    raw_pdb = f"{data_dir}/{name}.pdb"
    clean_pdb = f"{output_dir}/pdbs/{name}_atom_only.pdb"
    Path(f"{output_dir}/pdbs/").mkdir(parents=True, exist_ok=True)
    save_clean_pdb(raw_pdb, clean_pdb)

    # save pairs as fasta sequence files
    seq = convert_pdb_to_sequence(clean_pdb)
    seq = "\n".join([f">{name}", seq])
    Path(f"{output_dir}/sequences/apo_and_holo/").mkdir(parents=True, exist_ok=True)
    with open(f"{output_dir}/sequences/apo_and_holo/{name}.fasta", "w") as f:
        f.write(seq)

    # copy apo ids over to separate folder for AF input (AF only needs to run one of apo/holo pair bc same sequence)
    Path(f"{output_dir}/sequences/apo_only/").mkdir(parents=True, exist_ok=True)
    if is_apo:
        shutil.copyfile(
            f"{output_dir}/sequences/apo_and_holo/{name}.fasta", f"{output_dir}/sequences/apo_only/{name}.fasta"
        )

    # save pairs as alphafold structure representations; best-effort — skip structures
    # that cannot be converted rather than aborting the whole batch
    try:
        struct = get_alphafold_atom_positions(clean_pdb)
        Path(f"{output_dir}/af_structures/").mkdir(parents=True, exist_ok=True)
        with open(f"{output_dir}/af_structures/{name}.pickle", "wb") as f:
            pickle.dump(struct, file=f)
    except ValueError as e:
        print(f'Error with sequence {name} - {e}')


def _append_af_similarity(pairs_df, apo_id, holo_id, output_dir, af_output_dir):
    """Add RMSD-vs-AF-prediction columns to pairs_df in place (rmsd_apo_af, rmsd_holo_af,
    less_similar_conformer)."""
    rmsd_h, rmsd_a = list(), list()
    for h, a in zip(holo_id, apo_id):
        # fix: glob can return an empty list — fail with a clear message instead of a bare IndexError
        matches = glob.glob(f"{af_output_dir}/{a}_unrelaxed_rank_001_*_000.pdb")
        if not matches:
            raise FileNotFoundError(f"No AlphaFold output found for {a} in {af_output_dir}")
        af_output = matches[0]
        rmsd_h.append(get_rmsd(f"{output_dir}/pdbs/{h}_atom_only.pdb", af_output))
        rmsd_a.append(get_rmsd(f"{output_dir}/pdbs/{a}_atom_only.pdb", af_output))

    pairs_df["rmsd_apo_af"] = rmsd_a
    pairs_df["rmsd_holo_af"] = rmsd_h
    # the conformer with the larger RMSD to the AF prediction is the "less similar" one
    pairs_df["less_similar_conformer"] = pairs_df.apply(
        lambda x: x["holo_id"] if x["rmsd_apo_af"] < x["rmsd_holo_af"] else x["apo_id"], axis=1
    )
115 changes: 115 additions & 0 deletions src/metfish/representation_manipulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import pickle
import numpy as np
import pandas as pd
import warnings

from pathlib import Path


def modify_representations(prev: dict = None, method: str = "none", **kwargs):
    """Modify the pair, position (structural), or MSA (first row only) representations
    that will be used as inputs for the next recycling iteration of AlphaFold.

    Args:
        prev (dict): Dict of pair, position, and msa representations.
        method (str): Method to use to modify representations.

    Returns:
        dict: modified pair, position and msa representations
    """
    # dispatch to the requested modification (test examples, will eventually
    # modify prev outputs based on SAXS data); guard clauses instead of match
    if method == "none":
        return prev
    if method == "reinitialize":
        return reinitialize(prev)
    if method == "add_noise":
        return add_noise(prev)
    if method == "replace_structure":
        return replace_structure(prev, **kwargs)

    # unknown method name: leave the representations untouched
    warnings.warn("Representation modification method not supported - defaulting to no modification")
    return prev


def add_noise(prev):
    """Adds gaussian noise to the pair, structure, and MSA representations"""
    rng = np.random.default_rng()
    # gaussian noise with mean 0 and std 1, cast to the half precision AF uses;
    # builds a new dict so the input representations are left unmodified
    return {
        key: value + rng.standard_normal(value.shape).astype("float16")
        for key, value in prev.items()
    }


def reinitialize(prev):
    """Reinitializes the pair, structure, and MSA, representations to zero arrays"""
    # sequence length is the leading dimension of the pair representation
    n_res = np.shape(prev["prev_pair"])[0]

    # standard AlphaFold representation shapes: msa first row (L, 256),
    # pair (L, L, 128), atom positions (L, 37, 3)
    shapes = {
        "prev_msa_first_row": (n_res, 256),
        "prev_pair": (n_res, n_res, 128),
        "prev_pos": (n_res, 37, 3),
    }

    return {key: np.zeros(shape, dtype=np.float16) for key, shape in shapes.items()}


def replace_structure(prev, job_name, input_dir=None, replacement_method="template"):
    """Replace intermediate structure (atom position) representations from alphafold.

    Looks up the conformer pair for `job_name` in apo_holo_pairs_with_similarity.csv
    and overwrites prev["prev_pos"] in place with a pickled AF-format structure chosen
    according to `replacement_method`.

    Args:
        prev (dict): representations dict; only "prev_pos" is modified (in place).
        job_name (str): job identifier; the pdb id is taken as the text before the
            first underscore.
        input_dir (str, optional): directory holding the pairs csv and af_structures/
            pickles. Defaults to the repo-level data/ directory.
        replacement_method (str): "less_similar", "alternate", or "template".
            Any other value warns and leaves prev unmodified.

    Returns:
        dict: the (possibly modified) prev dict.
    """
    # load in conformer pair information
    input_dir = input_dir or Path(__file__).resolve().parents[2] / 'data'
    conformer_pairs_fname = f"{input_dir}/apo_holo_pairs_with_similarity.csv"

    pdb_name = job_name.split("_")[0]
    conformer_df = pd.read_csv(conformer_pairs_fname)
    pairs = list(zip(conformer_df["apo_id"], conformer_df["holo_id"]))

    # get relevant pairs (substring match, so pdb_name "1abc" also matches ids like
    # "1abc_A" — presumably ids are unique enough for this; NOTE(review): confirm)
    index = [ind for ind, (a, h) in enumerate(pairs) if pdb_name in a or pdb_name in h]
    pair_info = conformer_df.iloc[index, :]

    # get structure name depending on replacement method
    # NOTE(review): the "alternate"/"template" branches index .to_list()[0], which
    # raises IndexError when no pair matched — only the "less_similar" and default
    # branches reach the empty-selection warning below
    match replacement_method:
        case "less_similar":
            replacement_pdb_name = pair_info["less_similar_conformer"]
        case "alternate":
            replacement_pdb_name = (
                pair_info["holo_id"] if pdb_name in pair_info["apo_id"].to_list()[0] else pair_info["apo_id"]
            ) # get opposite conformer
        case "template":
            replacement_pdb_name = (
                pair_info["apo_id"] if pdb_name in pair_info["apo_id"].to_list()[0] else pair_info["holo_id"]
            ) # provide conformer experimental structure
        case _:
            replacement_pdb_name = []

    # any() on the pandas Series / list is truthy only when a non-empty id was selected
    if any(replacement_pdb_name):
        # load replacement structure (pickle written by prep_conformer_pairs)
        print(f"Replacing {pdb_name} intermediate structure with {replacement_pdb_name.to_list()[0]}.")
        with open(f"{input_dir}/af_structures/{replacement_pdb_name.to_list()[0]}.pickle", "rb") as f:
            replacement_structure = pickle.load(f)

        # NOTE - additional zero values seem to get added to the first dimension of the position array
        # (n_res) when running multiple sequences. AF ignores anything longer than n_res when writing
        # to a pdb file from the protein class, so replacing the first n_res values and adding a warning
        if np.shape(prev["prev_pos"])[0] != np.shape(replacement_structure)[0]:
            warnings.warn(
                f"Alphafold intermediate {np.shape(prev['prev_pos'])} and modified conformer "
                f"{np.shape(replacement_structure)} structures were not the same shape.",
            )

        # replace conformer with alternative option (in-place slice assignment mutates prev)
        n_res = np.shape(replacement_structure)[0]
        prev["prev_pos"][:n_res, :, :] = replacement_structure.astype("float16")

    else:
        warnings.warn(f'No replacement option found for "{pdb_name}". Continuing without modification')

    return prev
Loading
Loading