Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Foun 1338 enable multiple humanization variants in biophi #48

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 86 additions & 8 deletions biophi/humanization/cli/sapiens.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,46 @@
from functools import partial
from multiprocessing import Pool
import os
from pathlib import Path
import sys
from typing import List, Optional

import click
from abnumber import Chain, ChainParseError, SUPPORTED_CDR_DEFINITIONS, SUPPORTED_SCHEMES
from Bio import SeqIO
import click
import pandas as pd

from biophi.common.utils.formatting import logo
from biophi.common.utils.io import parse_antibody_files, write_sheets
from biophi.common.utils.seq import iterate_fasta
from biophi.humanization.cli.oasis import show_unpaired_warning
from biophi.humanization.methods.humanization import (
generate_samples,
generate_seq_records_from_sequence_strings,
humanize_chain,
SapiensHumanizationParams,
HumanizationParams
)
from biophi.humanization.methods.humanness import OASisParams
from biophi.humanization.methods.humanization import humanize_chain, SapiensHumanizationParams, HumanizationParams
from abnumber import Chain, ChainParseError, SUPPORTED_CDR_DEFINITIONS, SUPPORTED_SCHEMES
import os
import sys
from tqdm import tqdm
from biophi.humanization.web.tasks import humanize_antibody_task, HumanizeAntibodyTaskResult

from tqdm import tqdm

@click.command()
@click.argument('inputs', required=False, nargs=-1)
@click.option('--output', required=False, help='Output directory path. With --fasta-only, output FASTA file path.')
@click.option('--fasta-only', is_flag=True, default=False, type=bool, help='Output only a FASTA file with humanized sequences (speeds up processing)')
@click.option('--scores-only', is_flag=True, default=False, type=bool, help='Output only a CSV file with Sapiens position*residue scores')
@click.option('--mean-score-only', is_flag=True, default=False, type=bool, help='Output only a CSV file with one Sapiens score per sequence')
@click.option('--generate-only', is_flag=True, default=False, type=bool, help='Generate humanized sequences based on the sapiens BERT model')
@click.option('--oasis-db', required=False, help='OAS peptide database connection string (required to run OASis)')
@click.option('--version', default='latest', help='Sapiens trained model name')
@click.option('--iterations', type=int, default=1, help='Run Sapiens given number of times to discover more humanizing mutations')
@click.option('--scheme', default=HumanizationParams.cdr_definition, help=f'Numbering scheme: one of {", ".join(SUPPORTED_SCHEMES)}')
@click.option('--cdr-definition', default=HumanizationParams.cdr_definition, help=f'CDR definition: one of {", ".join(SUPPORTED_CDR_DEFINITIONS)}')
@click.option('--humanize-cdrs', is_flag=True, default=False, type=bool, help='Allow humanizing mutations in CDRs')
@click.option('--limit', required=False, metavar='N', type=int, help='Process only first N records')
def sapiens(inputs, output, fasta_only, scores_only, mean_score_only, version, iterations, scheme, cdr_definition, humanize_cdrs, limit, oasis_db):
@click.option('--n_generated_sequences', required=False, type=int, default=1000, help='Number of sequences to output when using --generate-only')
def sapiens(inputs, output, fasta_only, scores_only, mean_score_only, generate_only, version, iterations, scheme, cdr_definition, humanize_cdrs, limit, oasis_db, n_generated_sequences):
"""Sapiens: Antibody humanization using deep learning.

Sapiens is trained on 20 million natural antibody sequences
Expand All @@ -51,6 +61,10 @@ def sapiens(inputs, output, fasta_only, scores_only, mean_score_only, version, i
# Humanize FASTA file(s), save to directory along with OASis humanness report
biophi sapiens input.fa --output ./report/ \\
--oasis-db sqlite:////Absolute/path/to/oas_human_subject_9mers_2019_11.db

\b
# Generate humanized sequences FASTA file
biophi sapiens input.fa --generate-only --output humanized_sequences

INPUTS: Input FASTA file path(s). If not provided, creates an interactive session.
"""
Expand All @@ -72,6 +86,8 @@ def sapiens(inputs, output, fasta_only, scores_only, mean_score_only, version, i
click.echo(f'- Producing Sapiens scores only', err=True)
if fasta_only:
raise ValueError('Cannot use --fasta-only together with --scores-only')
if generate_only:
raise ValueError('Cannot use --generate-only together with --scores-only')
if iterations != 1:
raise ValueError('Iterations cannot be used with --scores-only')
iterations = 0
Expand Down Expand Up @@ -108,6 +124,14 @@ def sapiens(inputs, output, fasta_only, scores_only, mean_score_only, version, i
limit=limit,
humanization_params=humanization_params
)
elif generate_only:
return sapiens_generate_only(
inputs,
output,
limit=limit,
humanization_params=humanization_params,
initial_n_samples=n_generated_sequences,
)
else:
from biophi.common.web.views import app
with app.app_context():
Expand Down Expand Up @@ -212,6 +236,60 @@ def sapiens_fasta_only(inputs, output_fasta, humanization_params, limit=None):
click.echo('Done.', err=True)


def sapiens_generate_only(
        inputs: List[str],
        output_fasta: str,
        humanization_params: HumanizationParams,
        limit: Optional[int] = None,
        initial_n_samples: int = 1000
) -> None:
    """Generates initial_n_samples sequences using the output of the sapiens model.

    The model outputs a probability matrix where for each residue of the input sequence
    we are given the probabilities of an amino acid being in that position.
    We use this matrix as a sampling distribution to generate new sequences. We try to generate
    initial_n_samples sequences, and drop any duplicates that are generated.

    Args:
        inputs: List of fasta files with sequences to humanize.
        output_fasta: Output directory path; one FASTA file is written per input chain.
            If empty/None, generated records are printed to stdout instead.
        humanization_params: The params used to humanize the input sequences.
        limit: Number of input sequences to process from inputs. Defaults to None.
        initial_n_samples: target number of samples to generate. Defaults to 1000.

    """
    output_dir = None
    if output_fasta:
        output_dir = Path(output_fasta)
        output_dir.mkdir(parents=True, exist_ok=True)

    chains = [Chain(
        record.seq,
        name=record.id,
        scheme=humanization_params.scheme,
        cdr_definition=humanization_params.cdr_definition
    ) for record in tqdm(iterate_fasta(inputs, limit=limit))]

    # Progress bar is disabled when printing to stdout so it does not mix with output.
    for chain in tqdm(chains, disable=(not output_fasta)):
        humanization = humanize_chain(chain, params=humanization_params)
        scores = humanization.to_score_dataframe()
        generated_samples = generate_samples(input_df=scores, initial_n_samples=initial_n_samples)
        chain_label = 'VH' if humanization.humanized_chain.is_heavy_chain() else 'VL'

        records = generate_seq_records_from_sequence_strings(
            sequences=generated_samples,
            id_prefix=f'{chain.name}{humanization.humanized_chain.tail}',
            description=f'{chain_label} {humanization_params.get_export_name().replace("_"," ")}'.strip()
        )

        if output_dir is not None:
            # Use a context manager so the file handle is always closed
            # (previously the handle was opened and never closed).
            with open(output_dir / f"{chain.name}_generated.fasta", 'wt') as f:
                SeqIO.write(records, f, 'fasta-2line')
            click.echo(f'Saved {len(records):,} humanized chains to: {output_fasta}', err=True)
        else:
            # BUG FIX: without --output, the previous code never wrote anything
            # (the stdout branch was unreachable inside `if output_fasta:`).
            SeqIO.write(records, sys.stdout, 'fasta-2line')

    if output_fasta:
        # Echo once after all chains are processed (previously echoed per chain).
        click.echo('Done.', err=True)


def sapiens_full(inputs, output_dir, humanization_params, oasis_params, limit=None):
if oasis_params is None:
raise ValueError('Use --oasis-db PATH_TO_OASIS.db to get full output, or consider using --fasta-only')
Expand Down
60 changes: 56 additions & 4 deletions biophi/humanization/methods/humanization.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from functools import cached_property
from typing import Dict, List, Optional
import pandas as pd
import requests
from dataclasses import dataclass

from abnumber.chain import Chain, Alignment, Position
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from dataclasses import dataclass
from flask import current_app
import numpy as np
from numpy.random import default_rng
import pandas as pd
import sapiens

from biophi.common.utils.formatting import get_valid_filename
import sapiens


@dataclass
Expand Down Expand Up @@ -253,3 +257,51 @@ def sapiens_predict_chain(chain, model_version='latest', return_all_hiddens=Fals
model_version=model_version,
return_all_hiddens=return_all_hiddens
)


def generate_samples(input_df: pd.DataFrame, initial_n_samples: int, seed: int = 42) -> List[str]:
    """Sample unique sequences from a per-position amino-acid probability matrix.

    Each row of ``input_df`` is one position of the input sequence and each column is
    an amino acid; the cell holds the model's probability of that amino acid at that
    position. ``initial_n_samples`` sequences are drawn position-by-position from this
    distribution and duplicates are dropped, so the returned list may contain fewer
    than ``initial_n_samples`` sequences.

    Args:
        input_df: DataFrame of shape (n_positions, n_amino_acids) with probability scores;
            column labels are the amino-acid letters.
        initial_n_samples: Target number of sequences to draw before de-duplication.
        seed: Random seed for reproducible sampling. Defaults to 42 (the original
            hard-coded value, so default behavior is unchanged).

    Returns:
        List of unique sampled sequence strings.
    """
    input_ndarray = input_df.to_numpy(dtype="float32")
    n_rows = input_ndarray.shape[0]
    n_possible_aa_residues = input_ndarray.shape[1]
    aa_array = input_df.columns.values
    rng = default_rng(seed=seed)

    # Ensure that the probabilities for each residue sum to 1
    # (rng.choice requires a proper distribution; float32 rounding can break it).
    fixed_probabilities = input_ndarray / input_ndarray.sum(axis=1, keepdims=True)
    # make an ndarray of shape (n_samples, n_rows), where each row corresponds to a possible sequence,
    # and each value in the row corresponds to the aa to be chosen from the aa_array
    # If sampled[0] = [1,4,5], then that sequence would correspond to CFG
    # this could be simplified with something like
    # https://stackoverflow.com/questions/34187130/fast-random-weighted-selection-across-all-rows-of-a-stochastic-matrix?noredirect=1&lq=1
    # https://stackoverflow.com/questions/47722005/vectorizing-numpy-random-choice-for-given-2d-array-of-probabilities-along-an-a
    # https://stackoverflow.com/questions/40474436/how-to-apply-numpy-random-choice-to-a-matrix-of-probability-values-vectorized-s
    list_of_random_samples = [
        rng.choice(
            n_possible_aa_residues,
            initial_n_samples,
            p=fixed_probabilities[i]
        ) for i in range(n_rows)
    ]
    sampled = np.stack(list_of_random_samples, axis=1)
    # Get unique_sequences, many of the sequences will be duplicates given that each
    # residue normally has one aa with a very high probability
    unique_rows = np.unique(sampled, axis=0)
    # Convert indices of aa_array to strings
    sequences = ["".join(np.take(aa_array, row)) for row in unique_rows]
    return sequences

def generate_seq_records_from_sequence_strings(
        sequences: List[str],
        id_prefix: str,
        description: str,
) -> List[SeqRecord]:
    """Wrap plain sequence strings into Biopython SeqRecord objects.

    All records share the same ``id_prefix`` as their id; the running index is
    appended to ``description`` so individual records can still be told apart.

    Args:
        sequences: Sequence strings to wrap.
        id_prefix: Record id used for every record.
        description: Base description; the record's index is appended to it.

    Returns:
        One SeqRecord per input sequence, in the same order.
    """
    records = []
    for index, sequence in enumerate(sequences):
        record = SeqRecord(
            Seq(sequence),
            id=id_prefix,
            description=f'{description} {index}'
        )
        records.append(record)
    return records