# Rule set 3 target-site predictions
import lightgbm
import pandas as pd
from rs3 import targetdata
from scipy import stats
import numpy as np
# Point __file__ at this notebook's file name inside the current working
# directory (os.path.abspath('') resolves to the cwd). Presumably this lets
# code that resolves paths relative to __file__ work when run from the
# notebook -- confirm against the code that reads __file__.
# os.path.join is used instead of concatenating a hard-coded '/' so the
# separator is correct on every platform.
__file__ = os.path.join(os.path.abspath(''), '03_predicttarg.ipynb')
import multiprocessing
# CPU count reported by multiprocessing; passed as n_jobs to the conservation
# query further down.
max_n_jobs = multiprocessing.cpu_count()
# The loaded target model must carry a LightGBM regressor under the
# 'regressor' key. isinstance is the idiomatic type check and also accepts
# subclasses of LGBMRegressor.
assert isinstance(load_target_model()['regressor'], lightgbm.sklearn.LGBMRegressor)
# Columns passed as id_cols to each of the feature builders below.
id_cols = [
    'sgRNA Context Sequence',
    'Target Cut Length',
    'Target Transcript',
    'Orientation',
]

# Example sgRNA designs (read_table parses tab-delimited text by default).
design_df = pd.read_table('test_data/sgrna-designs.txt')

# Designs with the extra target columns added by targetfeat.
design_targ_df = targetfeat.add_target_columns(design_df)

# Amino-acid sequences for the designs, built with two parallel jobs.
aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)

# Amino-acid subsequence table for each design (width=16).
aa_subseq_df = targetfeat.get_aa_subseq_df(
    sg_designs=design_targ_df,
    aa_seq_df=aa_seq_df,
    width=16,
    id_cols=id_cols,
)
# Bare expression: displays the frame in a notebook cell, no effect in a script.
aa_subseq_df
# Translation-overlap table for every distinct id in aa_seq_df, built with
# two parallel jobs.
domain_df = targetdata.build_translation_overlap_df(
    aa_seq_df['id'].unique(),
    n_jobs=2,
)

# Protein-domain features for the designs; sources is left as None.
domain_feature_df = targetfeat.get_protein_domain_features(
    design_targ_df,
    domain_df,
    sources=None,
    id_cols=id_cols,
)
# Conservation table for the designs, using one job per reported CPU.
conservation_df = targetdata.build_conservation_df(
    design_df,
    n_jobs=max_n_jobs,
)

# Conservation features computed from the 'ranked_conservation' column with
# a small (2) and a large (16) width.
conservation_feature_df = targetfeat.get_conservation_features(
    design_targ_df,
    conservation_df,
    small_width=2,
    large_width=16,
    conservation_column='ranked_conservation',
    id_cols=id_cols,
)
# Bare expression: displays the frame in a notebook cell, no effect in a script.
conservation_feature_df
# Target scores using all three feature tables.
predictions = predict_target(
    design_df=design_df,
    aa_subseq_df=aa_subseq_df,
    domain_feature_df=domain_feature_df,
    conservation_feature_df=conservation_feature_df,
)
design_df['Target Score'] = predictions

# "Lite" target scores: only the amino-acid subsequence table is supplied.
# Note that design_df already holds the 'Target Score' column at this point.
lite_predictions = predict_target(
    design_df=design_df,
    aa_subseq_df=aa_subseq_df,
)
design_df['Target Score Lite'] = lite_predictions
# Bare expression: displays the column in a notebook cell, no effect in a script.
design_df['sgRNA Context Sequence']

# The full and lite scores must correlate with Pearson r above 0.7.
assert stats.pearsonr(
    design_df['Target Score'],
    design_df['Target Score Lite'],
)[0] > 0.7
# Measured sgRNA activity tables used to sanity-check the target scores.
sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')
gecko_df = pd.read_csv('test_data/Aguirre2016_activity.csv')

# Attach the scored designs to each activity table (inner join on the shared
# sgRNA / gene / cut-position columns).
sanger_designs = pd.merge(
    sanger_df,
    design_df,
    how='inner',
    on=[
        'sgRNA Sequence',
        'sgRNA Context Sequence',
        'Target Gene Symbol',
        'Target Cut %',
    ],
)
gecko_designs = pd.merge(
    gecko_df,
    design_df,
    how='inner',
    on=[
        'sgRNA Sequence',
        'sgRNA Context Sequence',
        'Target Gene Symbol',
        'Target Cut %',
    ],
)

# Target Score must correlate with measured activity above a per-dataset
# Pearson r floor: 0.2 for the Behan2019 table, 0.05 for the Aguirre2016 table.
assert stats.pearsonr(
    sanger_designs['avg_mean_centered_neg_lfc'],
    sanger_designs['Target Score'],
)[0] > 0.2
assert stats.pearsonr(
    gecko_designs['avg_mean_centered_neg_lfc'],
    gecko_designs['Target Score'],
)[0] > 0.05
# Reference scores exported to CSV (the "rs_dev" files). The lite export
# names its score column 'Target Lite Score'; rename it to match the column
# name used on design_df.
rs_dev_target_lite_predictions = (
    pd.read_csv('test_data/target_lite_score_export.csv')
    .rename(columns={'Target Lite Score': 'Target Score Lite'})
)
rs_dev_target_predictions = pd.read_csv('test_data/target_score_export.csv')

# No `on` is given, so this inner join uses every column the two exports share.
merged_rs_dev_predictions = rs_dev_target_lite_predictions.merge(
    rs_dev_target_predictions,
    how='inner',
)

# Line the reference scores up with the scores computed on design_df.
# Overlapping column names get the ' rs3' (design_df) and ' rs_dev'
# (reference) suffixes.
merged_rs_dev_rs3_predictions = design_df.merge(
    merged_rs_dev_predictions,
    how='inner',
    on=[
        'sgRNA Context Sequence',
        'Target Cut Length',
        'Target Transcript',
        'Orientation',
    ],
    suffixes=[' rs3', ' rs_dev'],
)

# np.allclose returns True for empty inputs, so an inner join that matched no
# rows would let the checks below pass without comparing anything.
assert not merged_rs_dev_rs3_predictions.empty, (
    'no rows shared between design_df and the exported reference predictions'
)
assert np.allclose(
    merged_rs_dev_rs3_predictions['Target Score rs3'],
    merged_rs_dev_rs3_predictions['Target Score rs_dev'],
)
assert np.allclose(
    merged_rs_dev_rs3_predictions['Target Score Lite rs3'],
    merged_rs_dev_rs3_predictions['Target Score Lite rs_dev'],
)