Bio-Hack/valid_data_prediction.py at develop · jayaram0528/Bio-Hack · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from xgboost import XGBRegressor
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from rdkit.Chem import Descriptors

# Module-level regressor shared by predict_drug_response below.
# NOTE: this runs at import time, so importing the module requires the model
# file to be loadable. The path is relative (no directory component), so it is
# presumably resolved against the current working directory — confirm for your
# deployment.
xgb_model = XGBRegressor()
xgb_model.load_model("xgb_trained_model.json")

# function to construct Morgan fingerprints from SMILES

def smiles_to_morgan_fp(smiles, radius=2, n_bits=2048):
    """
    Build a Morgan fingerprint bit list for a molecule given as SMILES.

    Args:
        smiles (str): The SMILES representation of the molecule.
        radius (int): Radius passed to the Morgan fingerprint routine.
        n_bits (int): Number of bits requested for the fingerprint.

    Returns:
        list | None: The fingerprint bit vector converted to a Python list,
            or None when RDKit cannot parse the SMILES string.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(
            molecule, radius, nBits=n_bits
        )
        return list(bit_vector)
    # MolFromSmiles signals an unparseable string by returning None.
    return None


def compute_descriptors(smiles):
    """
    Calculate a fixed set of RDKit descriptors for one molecule.

    Args:
        smiles (str): The SMILES representation of the molecule.

    Returns:
        list: [MolWt, LogP, TPSA, HBD, HBA] in that order, or a list of five
            None values when RDKit cannot parse the SMILES string.
    """
    # Order matters: callers label the returned values positionally.
    descriptor_fns = (
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Descriptors.TPSA,
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
    )
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # One placeholder per descriptor so the result length stays constant.
        return [None] * len(descriptor_fns)
    return [fn(molecule) for fn in descriptor_fns]


# define a function to predict the drug response
def predict_drug_response(csv_file_path):
    """
    Predict a biomarker value for every drug listed in a CSV file.

    Each row's SMILES string (column 'Drug') is turned into a Morgan
    fingerprint plus five molecular descriptors (MolWt, LogP, TPSA, HBD, HBA);
    the concatenated feature matrix is passed to the module-level
    ``xgb_model``. Rows whose SMILES is missing or cannot be parsed do not
    abort the run: they are kept in the output with a NaN prediction.

    Side effects: writes 'predicted_biomarker_values.csv' (relative path) and
    prints status messages.

    Args:
        csv_file_path (str): The path to the CSV file containing drug data.
            The file must provide the columns 'Drug_ID' and 'Drug'.

    Returns:
        pd.DataFrame: One row per input row with the columns 'Drug_ID',
            'Drug' and 'Bio_Marker_Value' (NaN where no prediction was
            possible).

    Raises:
        KeyError: If a required column is absent from the CSV.
    """
    data = pd.read_csv(csv_file_path)

    # Fail early with a clear message instead of a bare KeyError deep inside
    # the feature pipeline.
    missing_columns = [
        col for col in ('Drug_ID', 'Drug') if col not in data.columns
    ]
    if missing_columns:
        raise KeyError(
            f"Input CSV is missing required column(s): {missing_columns}"
        )

    # Build features only for rows RDKit can parse. A None fingerprint would
    # otherwise produce a ragged array and crash np.hstack / predict.
    fingerprints = []
    descriptors = []
    is_valid = np.zeros(len(data), dtype=bool)
    for position, smiles in enumerate(data['Drug']):
        # Empty CSV cells arrive as float NaN; RDKit only accepts strings.
        fingerprint = (
            smiles_to_morgan_fp(smiles) if isinstance(smiles, str) else None
        )
        if fingerprint is None:
            continue
        is_valid[position] = True
        fingerprints.append(fingerprint)
        descriptors.append(compute_descriptors(smiles))

    if fingerprints:
        # Feature layout: fingerprint bits first, then the five descriptors.
        features = np.hstack((
            np.array(fingerprints),
            np.array(descriptors, dtype=float),
        ))
        valid_predictions = xgb_model.predict(features)
        # Keep the model's own dtype so that fully valid inputs yield exactly
        # the same column (and CSV text) as a direct assignment would.
        predictions = np.full(
            len(data), np.nan, dtype=valid_predictions.dtype
        )
        predictions[is_valid] = valid_predictions
    else:
        # Empty file or no parseable SMILES at all: nothing to predict.
        predictions = np.full(len(data), np.nan, dtype=np.float32)

    skipped = int(len(data) - is_valid.sum())
    if skipped:
        print(
            f"Warning: {skipped} row(s) had a missing or unparseable SMILES; "
            "their Bio_Marker_Value is NaN"
        )

    data['Bio_Marker_Value'] = predictions

    # Persist only the identifier, the SMILES and the prediction.
    output_df = data[['Drug_ID', 'Drug', 'Bio_Marker_Value']]
    output_df.to_csv('predicted_biomarker_values.csv', index=False)
    print("Predictions saved to 'predicted_biomarker_values.csv'")

    return output_df


# Example usage: runs only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    # Relative path to the input CSV; predict_drug_response reads its
    # 'Drug_ID' and 'Drug' columns.
    csv_file_path = 'valid.csv'  # Replace with your actual CSV file path
    # Besides returning the DataFrame, this call also writes
    # 'predicted_biomarker_values.csv'.
    predictions_df = predict_drug_response(csv_file_path)