Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test modification of intermediate structural representations #3

Draft
wants to merge 14 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/codespell.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ jobs:
- name: Codespell
uses: codespell-project/actions-codespell@v2
with:
skip: ./.git,*.pdb,./tests/data
skip: ./.git,*.pdb,./tests/data,*.ipynb
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ __pycache__
*.egg-info/
*.swp
.DS_Store

data/*
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ dynamic = ["version"]
[project.scripts]
calc-pr = "metfish.commands:get_Pr_cli"
extract-seq = "metfish.commands:extract_seq_cli"
prep-conformer-pairs = "metfish.commands:prep_conformer_pairs_cli"

[tool.ruff]
# Exclude a variety of commonly ignored directories.
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ pandas
numpy
scipy
periodictable
seaborn
matplotlib
alphafold-colabfold
1 change: 0 additions & 1 deletion scripts/PDB70_NERSC/PDBtoSeq.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import argparse
import os

def extract_seq(pdb_input,output_path):
pdb_name=pdb_input.split(".")[0]
Expand Down
1,165 changes: 1,165 additions & 0 deletions scripts/plot_conformer_pair_outcomes.ipynb

Large diffs are not rendered by default.

28 changes: 25 additions & 3 deletions src/metfish/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import pandas as pd

from .utils import get_Pr
from .utils import extract_seq
from .utils import get_Pr, extract_seq
from .preprocess import prep_conformer_pairs

def get_Pr_cli(argv=None):

Expand Down Expand Up @@ -41,7 +41,7 @@ def get_Pr_cli(argv=None):
step=args.step)

pd.DataFrame({"r": r, "P(r)": p}).to_csv(out, index=False)

def extract_seq_cli(argv=None):

parser = argparse.ArgumentParser(
Expand All @@ -54,3 +54,25 @@ def extract_seq_cli(argv=None):
args = parser.parse_args()

extract_seq(args.filename,args.output)

def prep_conformer_pairs_cli(argv=None):
    """CLI entry point for preprocessing conformer pair pdb files.

    Args:
        argv (list[str], optional): argument list to parse; defaults to sys.argv
            when None (the normal console-script path).
    """
    desc = "Preprocess conformer pair pdb files to generate fasta sequences, atom positions in AF structure format"
    epi = """Requires folder with the csv of apo/holo pairs and pdbs from Saldano et al., 2022"""

    parser = argparse.ArgumentParser(description=desc, epilog=epi)
    parser.add_argument("data_dir", help="the directory containing the pdb files / csv")
    parser.add_argument("-o", "--output_dir", type=str, default=None,
                        help="the directory to save output files to. Defaults to data_dir if not provided")
    parser.add_argument("-n", "--n_pairs", type=int, default=6,
                        help="the # of pairs to look at, N pairs -> 2N structures predicted")
    parser.add_argument("-a", "--af_output_dir", type=str, default=None,
                        help="path with alphafold outputs, used to calculate which conformer is less similar to the original AF prediction.",
                        )

    # fix: pass argv through so programmatic callers (and tests) can supply arguments;
    # parse_args(None) still falls back to sys.argv, so console-script behavior is unchanged
    args = parser.parse_args(argv)

    prep_conformer_pairs(args.data_dir,
                         output_dir=args.output_dir,
                         n_pairs=args.n_pairs,
                         af_output_dir=args.af_output_dir,)
75 changes: 75 additions & 0 deletions src/metfish/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import pandas as pd
import pickle
import glob
import shutil

from metfish.utils import get_alphafold_atom_positions, get_rmsd, convert_pdb_to_sequence, save_clean_pdb
from pathlib import Path


def prep_conformer_pairs(data_dir, output_dir=None, n_pairs=6, af_output_dir=None):
    """Preprocess pdb files to generate fasta sequences, atom positions in AF structure format.

    Requires folder with the csv of apo/holo pairs and pdbs from Saldano et al., 2022

    Args:
        data_dir (str): the directory containing the pdb files and apo_holo_pairs.csv
        output_dir (str, optional): the directory to save output files to. Defaults to data_dir if not provided
        n_pairs (int, optional): the # of pairs to look at, N pairs -> 2N structures predicted. Defaults to 6.
        af_output_dir (str, optional): path with alphafold outputs,
            used to calculate which conformer is less similar to the original AF prediction. Defaults to None.

    Raises:
        FileNotFoundError: if af_output_dir is given but no AF prediction exists for an apo id.
    """
    output_dir = output_dir or data_dir

    # load apo and holo id names; fixed seed so repeated runs subsample the same pairs
    pairs_df = pd.read_csv(f"{data_dir}/apo_holo_pairs.csv")
    if n_pairs < pairs_df.shape[0]:
        pairs_df = pairs_df.sample(n_pairs, random_state=1234)

    # convert pdb files into sequences, AF structures to use as AF inputs
    holo_id = pairs_df["holo_id"].to_list()
    apo_id = pairs_df["apo_id"].to_list()
    for name in [*holo_id, *apo_id]:
        _prep_structure_inputs(name, data_dir, output_dir, is_apo=name in apo_id)

    # calculate RMSD values between alphafold output and apo / holo conformers
    if af_output_dir is not None:
        _append_af_similarity(pairs_df, apo_id, holo_id, output_dir, af_output_dir)
        pairs_df.to_csv(f"{output_dir}/apo_holo_pairs_with_similarity.csv", index=False)


def _prep_structure_inputs(name, data_dir, output_dir, is_apo):
    """Clean one pdb, write its fasta sequence, and pickle its AF-format atom positions."""
    # clean pdbs (extract only ATOM coordinates)
    raw_pdb = f"{data_dir}/{name}.pdb"
    clean_pdb = f"{output_dir}/pdbs/{name}_atom_only.pdb"
    Path(f"{output_dir}/pdbs/").mkdir(parents=True, exist_ok=True)
    save_clean_pdb(raw_pdb, clean_pdb)

    # save pairs as fasta sequence files
    seq = convert_pdb_to_sequence(clean_pdb)
    seq = "\n".join([f">{name}", seq])
    Path(f"{output_dir}/sequences/apo_and_holo/").mkdir(parents=True, exist_ok=True)
    with open(f"{output_dir}/sequences/apo_and_holo/{name}.fasta", "w") as f:
        f.write(seq)

    # copy apo ids over to separate folder for AF input (AF only needs to run one of apo/holo pair bc same sequence)
    Path(f"{output_dir}/sequences/apo_only/").mkdir(parents=True, exist_ok=True)
    if is_apo:
        shutil.copyfile(
            f"{output_dir}/sequences/apo_and_holo/{name}.fasta", f"{output_dir}/sequences/apo_only/{name}.fasta"
        )

    # save pairs as alphafold structure representations; best-effort — skip structures
    # that cannot be converted rather than aborting the whole batch
    try:
        struct = get_alphafold_atom_positions(clean_pdb)
        Path(f"{output_dir}/af_structures/").mkdir(parents=True, exist_ok=True)
        with open(f"{output_dir}/af_structures/{name}.pickle", "wb") as f:
            pickle.dump(struct, file=f)
    except ValueError as e:
        print(f'Error with sequence {name} - {e}')


def _append_af_similarity(pairs_df, apo_id, holo_id, output_dir, af_output_dir):
    """Add RMSD-vs-AF-prediction columns to pairs_df in place (rmsd_apo_af, rmsd_holo_af,
    less_similar_conformer)."""
    rmsd_h, rmsd_a = list(), list()
    for h, a in zip(holo_id, apo_id):
        # fix: glob can return an empty list — fail with a clear message instead of a bare IndexError
        matches = glob.glob(f"{af_output_dir}/{a}_unrelaxed_rank_001_*_000.pdb")
        if not matches:
            raise FileNotFoundError(f"No AlphaFold output found for {a} in {af_output_dir}")
        af_output = matches[0]
        rmsd_h.append(get_rmsd(f"{output_dir}/pdbs/{h}_atom_only.pdb", af_output))
        rmsd_a.append(get_rmsd(f"{output_dir}/pdbs/{a}_atom_only.pdb", af_output))

    pairs_df["rmsd_apo_af"] = rmsd_a
    pairs_df["rmsd_holo_af"] = rmsd_h
    # the conformer with the larger RMSD to the AF prediction is the "less similar" one
    pairs_df["less_similar_conformer"] = pairs_df.apply(
        lambda x: x["holo_id"] if x["rmsd_apo_af"] < x["rmsd_holo_af"] else x["apo_id"], axis=1
    )
115 changes: 115 additions & 0 deletions src/metfish/representation_manipulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import pickle
import numpy as np
import pandas as pd
import warnings

from pathlib import Path


def modify_representations(prev: dict = None, method: str = "none", **kwargs):
    """Modify the pair, position (structural), or MSA (first row only) representations
    that will be used as inputs for the next recycling iteration of AlphaFold.

    Args:
        prev (dict): Dict of pair, position, and msa representations.
        method (str): Method to use to modify representations.

    Returns:
        dict: modified pair, position and msa representations
    """
    # dispatch to the requested modification (test examples, will eventually
    # modify prev outputs based on SAXS data); guard clauses instead of match
    if method == "none":
        return prev
    if method == "reinitialize":
        return reinitialize(prev)
    if method == "add_noise":
        return add_noise(prev)
    if method == "replace_structure":
        return replace_structure(prev, **kwargs)

    # unknown method name: leave the representations untouched
    warnings.warn("Representation modification method not supported - defaulting to no modification")
    return prev


def add_noise(prev):
    """Adds gaussian noise to the pair, structure, and MSA representations"""
    rng = np.random.default_rng()
    # gaussian noise with mean 0 and std 1, cast to the half precision AF uses;
    # builds a new dict so the input representations are left unmodified
    return {
        key: value + rng.standard_normal(value.shape).astype("float16")
        for key, value in prev.items()
    }


def reinitialize(prev):
    """Reinitializes the pair, structure, and MSA, representations to zero arrays"""
    # sequence length is the leading dimension of the pair representation
    n_res = np.shape(prev["prev_pair"])[0]

    # standard AlphaFold representation shapes: msa first row (L, 256),
    # pair (L, L, 128), atom positions (L, 37, 3)
    shapes = {
        "prev_msa_first_row": (n_res, 256),
        "prev_pair": (n_res, n_res, 128),
        "prev_pos": (n_res, 37, 3),
    }

    return {key: np.zeros(shape, dtype=np.float16) for key, shape in shapes.items()}


def replace_structure(prev, job_name, input_dir=None, replacement_method="template"):
    """Replace intermediate structure (atom position) representations from alphafold.

    Looks up the conformer pair for `job_name` in apo_holo_pairs_with_similarity.csv
    and overwrites prev["prev_pos"] in place with a pickled AF-format structure chosen
    according to `replacement_method`.

    Args:
        prev (dict): representations dict; only "prev_pos" is modified (in place).
        job_name (str): job identifier; the pdb id is taken as the text before the
            first underscore.
        input_dir (str, optional): directory holding the pairs csv and af_structures/
            pickles. Defaults to the repo-level data/ directory.
        replacement_method (str): "less_similar", "alternate", or "template".
            Any other value warns and leaves prev unmodified.

    Returns:
        dict: the (possibly modified) prev dict.
    """
    # load in conformer pair information
    input_dir = input_dir or Path(__file__).resolve().parents[2] / 'data'
    conformer_pairs_fname = f"{input_dir}/apo_holo_pairs_with_similarity.csv"

    pdb_name = job_name.split("_")[0]
    conformer_df = pd.read_csv(conformer_pairs_fname)
    pairs = list(zip(conformer_df["apo_id"], conformer_df["holo_id"]))

    # get relevant pairs (substring match, so pdb_name "1abc" also matches ids like
    # "1abc_A" — presumably ids are unique enough for this; NOTE(review): confirm)
    index = [ind for ind, (a, h) in enumerate(pairs) if pdb_name in a or pdb_name in h]
    pair_info = conformer_df.iloc[index, :]

    # get structure name depending on replacement method
    # NOTE(review): the "alternate"/"template" branches index .to_list()[0], which
    # raises IndexError when no pair matched — only the "less_similar" and default
    # branches reach the empty-selection warning below
    match replacement_method:
        case "less_similar":
            replacement_pdb_name = pair_info["less_similar_conformer"]
        case "alternate":
            replacement_pdb_name = (
                pair_info["holo_id"] if pdb_name in pair_info["apo_id"].to_list()[0] else pair_info["apo_id"]
            ) # get opposite conformer
        case "template":
            replacement_pdb_name = (
                pair_info["apo_id"] if pdb_name in pair_info["apo_id"].to_list()[0] else pair_info["holo_id"]
            ) # provide conformer experimental structure
        case _:
            replacement_pdb_name = []

    # any() on the pandas Series / list is truthy only when a non-empty id was selected
    if any(replacement_pdb_name):
        # load replacement structure (pickle written by prep_conformer_pairs)
        print(f"Replacing {pdb_name} intermediate structure with {replacement_pdb_name.to_list()[0]}.")
        with open(f"{input_dir}/af_structures/{replacement_pdb_name.to_list()[0]}.pickle", "rb") as f:
            replacement_structure = pickle.load(f)

        # NOTE - additional zero values seem to get added to the first dimension of the position array
        # (n_res) when running multiple sequences. AF ignores anything longer than n_res when writing
        # to a pdb file from the protein class, so replacing the first n_res values and adding a warning
        if np.shape(prev["prev_pos"])[0] != np.shape(replacement_structure)[0]:
            warnings.warn(
                f"Alphafold intermediate {np.shape(prev['prev_pos'])} and modified conformer "
                f"{np.shape(replacement_structure)} structures were not the same shape.",
            )

        # replace conformer with alternative option (in-place slice assignment mutates prev)
        n_res = np.shape(replacement_structure)[0]
        prev["prev_pos"][:n_res, :, :] = replacement_structure.astype("float16")

    else:
        warnings.warn(f'No replacement option found for "{pdb_name}". Continuing without modification')

    return prev
Loading
Loading