Source code for graphein.protein.features.nodes.geometry

"""Provides geometry-based featurisation functions."""
# Graphein
# Author: Arian Jamasb <arian@jamasb.io>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import logging

import networkx as nx
import numpy as np
import pandas as pd

from graphein.protein.utils import filter_dataframe


def add_sidechain_vector(
    g: nx.Graph, scale: bool = True, reverse: bool = False
):
    """Adds vector between each node and the average position of its sidechain atoms.

    We compute the mean of the sidechain atoms for each node. For this we use the ``rgroup_df`` dataframe.
    If the graph does not contain the ``rgroup_df`` dataframe, we compute it from the ``raw_pdb_df``.
    If ``scale``, we scale the vector to the unit vector.
    If ``reverse`` is ``False`` (default) we compute ``sidechain - node`` (pointing from the node
    to the sidechain centroid). If ``reverse`` is ``True`` we compute ``node - sidechain``.

    The result is stored in-place under the ``"sidechain_vector"`` key of each node's data dict.
    Glycine nodes, and nodes with no entry in ``rgroup_df``, are assigned ``np.array([0, 0, 0])``.
    A vector of zero length is left unscaled rather than being turned into NaNs.

    :param g: Graph to add vector to.
    :type g: nx.Graph
    :param scale: Scale vector to unit vector. Defaults to ``True``.
    :type scale: bool
    :param reverse: Reverse vector. Defaults to ``False``.
    :type reverse: bool
    """
    coord_cols = ["x_coord", "y_coord", "z_coord"]

    # Get or compute R-Group DF
    # NOTE(review): ``compute_rgroup_dataframe`` is not imported in the visible
    # part of this module -- confirm it is in scope before relying on this branch.
    if "rgroup_df" not in g.graph:
        g.graph["rgroup_df"] = compute_rgroup_dataframe(g.graph["raw_pdb_df"])

    # Average only the coordinate columns: the frame may also hold non-numeric
    # columns, which recent pandas versions refuse to average.
    sc_centroid = g.graph["rgroup_df"].groupby("node_id")[coord_cols].mean()

    # Iterate over nodes and compute vector
    for n, d in g.nodes(data=True):
        if d["residue_name"] == "GLY":
            # If GLY, set vector to 0
            vec = np.array([0, 0, 0])
        elif n not in sc_centroid.index:
            # No sidechain atoms recorded for this node: fall back to the zero
            # vector instead of failing the whole featurisation with a KeyError.
            logging.getLogger(__name__).warning(
                "No sidechain atoms found for node %s; "
                "setting sidechain vector to zero.",
                n,
            )
            vec = np.array([0, 0, 0])
        else:
            centroid = np.asarray(sc_centroid.loc[n, coord_cols], dtype=float)
            vec = d["coords"] - centroid if reverse else centroid - d["coords"]

            if scale:
                norm = np.linalg.norm(vec)
                # Guard against division by zero when node and centroid coincide.
                if norm > 0:
                    vec = vec / norm

        d["sidechain_vector"] = vec


def add_beta_carbon_vector(
    g: nx.Graph, scale: bool = True, reverse: bool = False
):
    """Adds vector between each node (typically alpha carbon) and the position of its beta carbon.

    Glycine does not have a beta carbon, so we set it to ``np.array([0, 0, 0])``.
    The beta carbon positions are taken from the ``CB`` rows of the ``rgroup_df`` dataframe.
    If the graph does not contain the ``rgroup_df`` dataframe, we compute it from the ``raw_pdb_df``.
    If ``scale``, we scale the vector to the unit vector.
    If ``reverse`` is ``False`` (default) we compute ``C beta - node`` (pointing from the node
    to the beta carbon). If ``reverse`` is ``True`` we compute ``node - C beta``.

    The result is stored in-place under the ``"c_beta_vector"`` key of each node's data dict.
    Non-glycine nodes with no ``CB`` row are also assigned ``np.array([0, 0, 0])``.
    A vector of zero length is left unscaled rather than being turned into NaNs.

    :param g: Graph to add vector to.
    :type g: nx.Graph
    :param scale: Scale vector to unit vector. Defaults to ``True``.
    :type scale: bool
    :param reverse: Reverse vector. Defaults to ``False``.
    :type reverse: bool
    """
    coord_cols = ["x_coord", "y_coord", "z_coord"]

    # Get or compute R-Group DF
    # NOTE(review): ``compute_rgroup_dataframe`` is not imported in the visible
    # part of this module -- confirm it is in scope before relying on this branch.
    if "rgroup_df" not in g.graph:
        g.graph["rgroup_df"] = compute_rgroup_dataframe(g.graph["raw_pdb_df"])

    # Presumably keeps only the rows whose ``atom_name`` is ``CB`` -- see
    # ``filter_dataframe`` for the exact semantics of ``boolean=True``.
    c_beta_coords = filter_dataframe(
        g.graph["rgroup_df"], "atom_name", ["CB"], boolean=True
    )
    # Index by node ID so each node's beta carbon can be looked up directly.
    c_beta_coords.index = c_beta_coords["node_id"]

    # Iterate over nodes and compute vector
    for n, d in g.nodes(data=True):
        if d["residue_name"] == "GLY":
            vec = np.array([0, 0, 0])
        elif n not in c_beta_coords.index:
            # No beta carbon recorded for this node: fall back to the zero
            # vector instead of failing the whole featurisation with a KeyError.
            logging.getLogger(__name__).warning(
                "No beta carbon found for node %s; "
                "setting beta carbon vector to zero.",
                n,
            )
            vec = np.array([0, 0, 0])
        else:
            # Cast to float: a row of the mixed-type frame would otherwise
            # yield an object-dtype array.
            c_beta = np.asarray(c_beta_coords.loc[n, coord_cols], dtype=float)
            vec = d["coords"] - c_beta if reverse else c_beta - d["coords"]

            if scale:
                norm = np.linalg.norm(vec)
                # Guard against division by zero when node and CB coincide.
                if norm > 0:
                    vec = vec / norm
        d["c_beta_vector"] = vec


def add_sequence_neighbour_vector(
    g: nx.Graph, scale: bool = True, reverse: bool = False, n_to_c: bool = True
):
    """Computes vector from node to adjacent node in sequence.
    Typically used with ``CA`` (alpha carbon) graphs.

    If ``n_to_c`` is ``True`` (default), we compute the vectors from the N terminus to the C terminus (canonical direction).
    Otherwise the residues of each chain are traversed in the opposite order.
    If ``reverse`` is ``False`` (default), we compute ``Node_{i+1} - Node_i``, where ``i + 1`` is the
    next residue in the traversal order.
    If ``reverse`` is ``True``, we compute ``Node_i - Node_{i+1}``.

    The result is stored in-place on each node under ``sequence_neighbour_vector_n_to_c`` or
    ``sequence_neighbour_vector_c_to_n`` depending on ``n_to_c``.
    The last residue of each chain, and any residue whose successor does not have an adjacent
    residue number (a chain break), is assigned ``np.array([0, 0, 0])``.
    A vector of zero length is left unscaled rather than being turned into NaNs.

    :param g: Graph to add vector to.
    :type g: nx.Graph
    :param scale: Scale vector to unit vector. Defaults to ``True``.
    :type scale: bool
    :param reverse: Reverse vector. Defaults to ``False``.
    :type reverse: bool
    :param n_to_c: Compute vector from N to C or C to N. Defaults to ``True``.
    :type n_to_c: bool
    """
    suffix = "n_to_c" if n_to_c else "c_to_n"
    key = f"sequence_neighbour_vector_{suffix}"

    # Iterate over every chain
    for chain_id in g.graph["chain_ids"]:

        # Find chain residues (in node iteration order)
        chain_residues = [
            (n, v) for n, v in g.nodes(data=True) if v["chain_id"] == chain_id
        ]

        if not n_to_c:
            chain_residues.reverse()

        last = len(chain_residues) - 1

        # Iterate over every residue in chain
        for i, (_, data) in enumerate(chain_residues):
            # Default for chain termini and chain breaks. Assigning it on every
            # iteration ensures a vector from a previous residue is never reused.
            vec = np.array([0, 0, 0])

            if i < last:
                successor = chain_residues[i + 1][1]
                # Both residues are on the same chain by construction, so only
                # the residue numbers need checking for adjacency.
                if abs(data["residue_number"] - successor["residue_number"]) == 1:
                    vec = successor["coords"] - data["coords"]

                    if reverse:
                        vec = -vec
                    if scale:
                        norm = np.linalg.norm(vec)
                        # Guard against division by zero for coincident nodes.
                        if norm > 0:
                            vec = vec / norm

            data[key] = vec