Module to generate target site features
from rs3 import targetdata
import multiprocessing
max_n_jobs = multiprocessing.cpu_count()

add_target_columns[source]

add_target_columns(design_df, transcript_id_col='Target Transcript', cut_pos_col='Target Cut Length', transcript_base_col='Transcript Base')

Add ['AA Index' and 'Transcript Base'] to design df

:param design_df: DataFrame :return: DataFrame

design_df = pd.read_table('test_data/sgrna-designs.txt')
design_targ_df = add_target_columns(design_df)
assert 'AA Index' in design_targ_df.columns

Position Features

The first feature class we consider is where the guide targets within the annotated transcript

get_position_features[source]

get_position_features(sg_df, id_cols)

Get features ['Target Cut %', 'sense']

:param sg_df: DataFrame :param id_cols: list :return: DataFrame

Amino Acid Features

We calculate a set of features from the amino acid sequence around the cutsite itself

aas = ['A', 'C', 'D', 'E', 'F',
       'G', 'H', 'I', 'K', 'L',
       'M', 'N', 'P', 'Q', 'R',
       'S', 'T', 'V', 'W', 'Y', '*']

get_one_aa_frac[source]

get_one_aa_frac(feature_dict, aa_sequence, aas)

Get fraction of single aa

:param feature_dict: dict, feature dictionary :param aa_sequence: str, amino acid sequence :param aas: list, list of amino acids

one_aa_ft = {}
get_one_aa_frac(one_aa_ft, 'ACDG*-', aas)
assert one_aa_ft['A'] == 1/6
assert one_aa_ft['Q'] == 0

get_aa_aromaticity[source]

get_aa_aromaticity(feature_dict, analyzed_seq)

Get fraction of aromatic amino acids in a sequence.

Phe (F) + Trp (W) + Tyr (Y)

:param feature_dict: :param analyzed_seq: ProteinAnalysis object

get_aa_hydrophobicity[source]

get_aa_hydrophobicity(feature_dict, analyzed_seq)

Grand Average of Hydropathy

The GRAVY value is calculated by adding the hydropathy value for each residue and dividing by the length of the sequence (Kyte and Doolittle; 1982). The larger the number, the more hydrophobic the amino acid

:param feature_dict: dict :param analyzed_seq: ProteinAnalysis object

get_aa_ip[source]

get_aa_ip(feature_dict, analyzed_seq)

Get the Isoelectric Point of an amino acid sequence

Charge of amino acid

:param feature_dict: dict :param analyzed_seq: ProteinAnalysis object

get_aa_secondary_structure[source]

get_aa_secondary_structure(feature_dict, analyzed_seq)

Get the fraction of amion acids that tend to be in a helix, turn or sheet

:param feature_dict: dict :param analyzed_seq: ProteinAnalysis object

aa_biochemical_fts1 = {}
get_aa_aromaticity(aa_biochemical_fts1, ProteinAnalysis('FWYA'))
aa_biochemical_fts2 = {}
get_aa_aromaticity(aa_biochemical_fts2, ProteinAnalysis('AAAA'))
assert aa_biochemical_fts1['Aromaticity'] > aa_biochemical_fts2['Aromaticity']

featurize_aa_seqs[source]

featurize_aa_seqs(aa_sequences, features=None)

Get feature DataFrame for a list of amino acid sequences

:param aa_sequences: list of str :param features: list or None :return: DataFrame

ft_dict_df = featurize_aa_seqs(pd.Series(['ACDG*-', 'CDG*--', 'LLLLLL']))
assert ft_dict_df.loc['LLLLLL', 'Hydrophobicity'] == ft_dict_df['Hydrophobicity'].max()

extract_amino_acid_subsequence[source]

extract_amino_acid_subsequence(sg_aas, width)

Get the amino acid subsequence with a width of width on either side of the Amino Acid index

:param sg_aas: DataFrame, sgRNA designs merged with amino acid sequence :param width: int :return: DataFrame

small_aa_seq_df = pd.DataFrame({'AA Index': [1, 5, 9],
                                    'seq': ['MAVLKYSLW']*3})
small_aa_subseq_df = extract_amino_acid_subsequence(small_aa_seq_df, 2)
actual_subseqs = small_aa_subseq_df['AA Subsequence']
expected_subseqs = ['--MAV', 'VLKYS', 'SLW*-']
assert len(actual_subseqs) == len(expected_subseqs)
assert all([a == b for a, b in zip(actual_subseqs, expected_subseqs)])

get_aa_subseq_df[source]

get_aa_subseq_df(sg_designs, aa_seq_df, width, id_cols, transcript_base_col='Transcript Base', target_transcript_col='Target Transcript', aa_index_col='AA Index')

Get the amino acid subsequences for a design dataframe

:param sg_designs: DataFrame :param aa_seq_df: DataFrame, Transcript Base and (AA) seq :param width: int, length on each side of the cut site :param transcript_base_col: str :param target_transcript_col: str :param aa_index_col: str :return: DataFrame

aa_seq_df = targetdata.build_transcript_aa_seq_df(design_targ_df, n_jobs=2)
Getting amino acid sequences
100%|██████████| 4/4 [00:03<00:00,  1.01it/s]
aa_subseq_df = get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,
                                id_cols=['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation'])
assert (aa_subseq_df['AA Subsequence'].str.len() == 33).all()
assert aa_subseq_df.shape[0] == design_targ_df.shape[0]
codon_map_df = pd.read_csv('test_data/codon_map.csv')

def get_rev_comp(sgrna):
    """Get reverse compliment of a guide"""
    nt_map = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    rev_comp = ''
    for nt in sgrna:
        rev_comp += nt_map[nt]
    rev_comp = rev_comp[::-1]
    return rev_comp

codon_map = pd.Series(codon_map_df['Amino Acid'].values, index=codon_map_df['Codon']).to_dict()
row = aa_subseq_df.sample(1, random_state=1).iloc[0, :]
subseq = row['AA Subsequence']
context = row['sgRNA Context Sequence']
rc_context = get_rev_comp(context)
translations = dict()
rc_translations = dict()
for i in [0, 1, 2]:
    translations[i] = ''.join([codon_map[context[j:j+3]] for j in range(i, len(context), 3)
                               if (j + 3) <= len(context)])
    rc_translations[i] = ''.join([codon_map[rc_context[j:j+3]] for j in range(i, len(rc_context), 3)
                                  if (j + 3) <= len(rc_context)])
assert ((translations[0] in subseq) or (translations[1] in subseq) or (translations[2] in subseq) or
        (rc_translations[0] in subseq) or (rc_translations[1] in subseq) or (rc_translations[2] in subseq))

get_amino_acid_features[source]

get_amino_acid_features(aa_subseq_df, features, id_cols)

Featurize amino acid sequences

:param aa_subseq_df: DataFrame :param features: list :param id_cols: list :return: DataFrame

aa_features = get_amino_acid_features(aa_subseq_df=aa_subseq_df,
                                      features=['Pos. Ind. 1mer',
                                                'Hydrophobicity', 'Aromaticity',
                                                'Isoelectric Point', 'Secondary Structure'],
                                      id_cols=['sgRNA Context Sequence', 'Target Cut Length',
                                               'Target Transcript', 'Orientation'])
assert aa_features['L'].idxmax() == aa_features['Hydrophobicity'].idxmax()

Protein Domain Features

get_protein_domain_features[source]

get_protein_domain_features(sg_design_df, protein_domains, id_cols, sources=None, transcript_base_col='Transcript Base', aa_index_col='AA Index', domain_type_col='type', domain_start_col='start', domain_end_col='end')

Get binary dataframe of protein domains

:param sg_design_df: DataFrame, with columns [transcript_base_col, aa_index_col] :param protein_domains: DataFrame, with columns [transcript_base_col, domain_type_col] :param id_cols: list :param sources: list. list of database types to include :param transcript_base_col: str :param aa_index_col: str :param domain_type_col: str :param domain_start_col: str :param domain_end_col: str :return: DataFrame, with binary features for protein domains

domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)
protein_domain_feature_df = get_protein_domain_features(design_targ_df, domain_df, sources=None,
                                                        id_cols=['sgRNA Context Sequence', 'Target Cut Length',
                                                                 'AA Index', 'Target Transcript', 'Orientation'])
Getting protein domains
100%|██████████| 200/200 [00:49<00:00,  4.04it/s]
assert protein_domain_feature_df.loc[protein_domain_feature_df['sgRNA Context Sequence'] == 'AAAAGAGCCATGAATCTAAACATCAGGAAT',
                                     ['PANTHER', 'ncoils', 'Seg', 'MobiDBLite']].sum(axis=1).values[0] == 4

Conservation Features

get_conservation_ranges[source]

get_conservation_ranges(cut_pos, small_width, large_width)

get_conservation_features[source]

get_conservation_features(sg_designs, conservation_df, conservation_column, small_width, large_width, id_cols)

Get conservation features

:param sg_designs: DataFrame :param conservation_df: DataFrame, tidy conservation scores indexed by Transcript Base and target position :param conservation_column: str, name of column to calculate scores with :param small_width: int, small window length to average scores in one direction :param large_width: int, large window length to average scores in the one direction :return: DataFrame of conservation features

conservation_df = targetdata.build_conservation_df(design_targ_df, n_jobs=max_n_jobs)
conservation_features = get_conservation_features(design_targ_df, conservation_df,
                                                  small_width=2, large_width=16,
                                                  conservation_column='ranked_conservation',
                                                  id_cols=['sgRNA Context Sequence', 'Target Cut Length',
                                                           'Target Transcript', 'Orientation'])
merged_features = protein_domain_feature_df.merge(conservation_features, how='inner', on=['sgRNA Context Sequence',
                                                                                          'Target Cut Length',
                                                                                          'Target Transcript',
                                                                                          'Orientation'])
smart_avg_cons = merged_features.loc[merged_features['Smart'].astype(bool), 'cons_32'].mean()
non_smart_avg_cons = merged_features.loc[~merged_features['Smart'].astype(bool), 'cons_32'].mean()
assert smart_avg_cons > non_smart_avg_cons
Getting conservation
100%|██████████| 200/200 [03:35<00:00,  1.08s/it]

Combining target features

We'll combine, the position, amino acid and domain feature matrices into a single target feature matrix

merge_feature_dfs[source]

merge_feature_dfs(design_df, aa_subseq_df, aa_features=None, domain_df=None, conservation_df=None, id_cols=None)

feature_df, feature_list = merge_feature_dfs(design_df=design_df,
                                             aa_subseq_df=aa_subseq_df,
                                             domain_df=protein_domain_feature_df,
                                             conservation_df=conservation_features)
assert feature_df[feature_list].shape[1] == len(feature_list)