Module to generate target site features
from rs3 import targetdata
import multiprocessing
max_n_jobs = multiprocessing.cpu_count()
design_df = pd.read_table('test_data/sgrna-designs.txt')
design_targ_df = add_target_columns(design_df)
assert 'AA Index' in design_targ_df.columns
aas = ['A', 'C', 'D', 'E', 'F',
'G', 'H', 'I', 'K', 'L',
'M', 'N', 'P', 'Q', 'R',
'S', 'T', 'V', 'W', 'Y', '*']
one_aa_ft = {}
get_one_aa_frac(one_aa_ft, 'ACDG*-', aas)
assert one_aa_ft['A'] == 1/6
assert one_aa_ft['Q'] == 0
aa_biochemical_fts1 = {}
get_aa_aromaticity(aa_biochemical_fts1, ProteinAnalysis('FWYA'))
aa_biochemical_fts2 = {}
get_aa_aromaticity(aa_biochemical_fts2, ProteinAnalysis('AAAA'))
assert aa_biochemical_fts1['Aromaticity'] > aa_biochemical_fts2['Aromaticity']
ft_dict_df = featurize_aa_seqs(pd.Series(['ACDG*-', 'CDG*--', 'LLLLLL']))
assert ft_dict_df.loc['LLLLLL', 'Hydrophobicity'] == ft_dict_df['Hydrophobicity'].max()
small_aa_seq_df = pd.DataFrame({'AA Index': [1, 5, 9],
'seq': ['MAVLKYSLW']*3})
small_aa_subseq_df = extract_amino_acid_subsequence(small_aa_seq_df, 2)
actual_subseqs = small_aa_subseq_df['AA Subsequence']
expected_subseqs = ['--MAV', 'VLKYS', 'SLW*-']
assert len(actual_subseqs) == len(expected_subseqs)
assert all([a == b for a, b in zip(actual_subseqs, expected_subseqs)])
aa_seq_df = targetdata.build_transcript_aa_seq_df(design_targ_df, n_jobs=2)
aa_subseq_df = get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,
id_cols=['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation'])
assert (aa_subseq_df['AA Subsequence'].str.len() == 33).all()
assert aa_subseq_df.shape[0] == design_targ_df.shape[0]
codon_map_df = pd.read_csv('test_data/codon_map.csv')
def get_rev_comp(sgrna):
"""Get reverse compliment of a guide"""
nt_map = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
rev_comp = ''
for nt in sgrna:
rev_comp += nt_map[nt]
rev_comp = rev_comp[::-1]
return rev_comp
codon_map = pd.Series(codon_map_df['Amino Acid'].values, index=codon_map_df['Codon']).to_dict()
row = aa_subseq_df.sample(1, random_state=1).iloc[0, :]
subseq = row['AA Subsequence']
context = row['sgRNA Context Sequence']
rc_context = get_rev_comp(context)
translations = dict()
rc_translations = dict()
for i in [0, 1, 2]:
translations[i] = ''.join([codon_map[context[j:j+3]] for j in range(i, len(context), 3)
if (j + 3) <= len(context)])
rc_translations[i] = ''.join([codon_map[rc_context[j:j+3]] for j in range(i, len(rc_context), 3)
if (j + 3) <= len(rc_context)])
assert ((translations[0] in subseq) or (translations[1] in subseq) or (translations[2] in subseq) or
(rc_translations[0] in subseq) or (rc_translations[1] in subseq) or (rc_translations[2] in subseq))
aa_features = get_amino_acid_features(aa_subseq_df=aa_subseq_df,
features=['Pos. Ind. 1mer',
'Hydrophobicity', 'Aromaticity',
'Isoelectric Point', 'Secondary Structure'],
id_cols=['sgRNA Context Sequence', 'Target Cut Length',
'Target Transcript', 'Orientation'])
assert aa_features['L'].idxmax() == aa_features['Hydrophobicity'].idxmax()
domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)
protein_domain_feature_df = get_protein_domain_features(design_targ_df, domain_df, sources=None,
id_cols=['sgRNA Context Sequence', 'Target Cut Length',
'AA Index', 'Target Transcript', 'Orientation'])
assert protein_domain_feature_df.loc[protein_domain_feature_df['sgRNA Context Sequence'] == 'AAAAGAGCCATGAATCTAAACATCAGGAAT',
['PANTHER', 'ncoils', 'Seg', 'MobiDBLite']].sum(axis=1).values[0] == 4
conservation_df = targetdata.build_conservation_df(design_targ_df, n_jobs=max_n_jobs)
conservation_features = get_conservation_features(design_targ_df, conservation_df,
small_width=2, large_width=16,
conservation_column='ranked_conservation',
id_cols=['sgRNA Context Sequence', 'Target Cut Length',
'Target Transcript', 'Orientation'])
merged_features = protein_domain_feature_df.merge(conservation_features, how='inner', on=['sgRNA Context Sequence',
'Target Cut Length',
'Target Transcript',
'Orientation'])
smart_avg_cons = merged_features.loc[merged_features['Smart'].astype(bool), 'cons_32'].mean()
non_smart_avg_cons = merged_features.loc[~merged_features['Smart'].astype(bool), 'cons_32'].mean()
assert smart_avg_cons > non_smart_avg_cons
feature_df, feature_list = merge_feature_dfs(design_df=design_df,
aa_subseq_df=aa_subseq_df,
domain_df=protein_domain_feature_df,
conservation_df=conservation_features)
assert feature_df[feature_list].shape[1] == len(feature_list)