Wrap all of the components needed to make Rule Set 3 predictions into a single function
from rs3.targetdata import write_transcript_data, write_conservation_data
# Load the test inputs: the sgRNA design table (tab-delimited text) and the
# two activity tables (CSV) that are merged with the predictions further down.
design_df = pd.read_csv('test_data/sgrna-designs.txt', sep='\t')
sanger_activity = pd.read_csv(
    'test_data/Behan2019_activity.csv',
)
gecko_activity = pd.read_csv(
    'test_data/Aguirre2016_activity.csv',
)
import multiprocessing
# Upper bound on parallel jobs: the CPU count reported for this machine.
max_n_jobs = multiprocessing.cpu_count()

# Score the designs without passing any stored target-data files.
# The options are collected in one mapping and unpacked as keyword arguments.
prediction_options = {
    'tracr': ['Hsu2013', 'Chen2013'],
    'target': True,
    'n_jobs_min': 2,
    'n_jobs_max': max_n_jobs,
    'lite': False,
}
scored_designs = predict(design_df, **prediction_options)
# Both writers are pointed at the same output location, so name it once.
target_data_dir = './data/target_data/'

# Transcript-level data for the designs (amino acid sequences and protein
# domains, going by the keyword names). This call is limited to 2 jobs.
write_transcript_data(
    design_df,
    n_jobs=2,
    filepath=target_data_dir,
    aa_seq_name='aa_seqs.pq',
    protein_domain_name='protein_domains.pq',
)

# Conservation data for the same designs; this call may use every CPU.
write_conservation_data(
    design_df,
    n_jobs=max_n_jobs,
    filepath=target_data_dir,
    cons_file_name='conservation.pq',
)
# Paths of the stored target-data files. They match the `filepath` and file
# names passed to the write_* calls earlier in this file.
aa_seq_path = './data/target_data/aa_seqs.pq'
protein_domain_path = './data/target_data/protein_domains.pq'
conservation_path = './data/target_data/conservation.pq'

# Same prediction settings as the earlier `predict` call, but this time the
# target data is supplied from the stored files.
scored_designs_stored = predict(
    design_df,
    tracr=['Hsu2013', 'Chen2013'],
    target=True,
    n_jobs_min=2,
    n_jobs_max=max_n_jobs,
    aa_seq_file=aa_seq_path,
    domain_file=protein_domain_path,
    # NOTE(review): the keyword is spelled `conservatin_file`. It is kept
    # verbatim because `predict`'s signature is not visible here; confirm
    # against the definition before renaming.
    conservatin_file=conservation_path,
    lite=False,
)

# Predictions made from stored files must equal the earlier predictions.
pd.testing.assert_frame_equal(scored_designs, scored_designs_stored)
# Join key between each activity table and the scored designs.
# NOTE(review): 'Target Cut %' is part of the key; presumably it is stored
# identically in both tables. Verify, since the inner join drops mismatches.
activity_id_cols = [
    'sgRNA Sequence',
    'sgRNA Context Sequence',
    'Target Gene Symbol',
    'Target Cut %',
]

# Inner joins keep only rows whose key appears in both tables.
# The name `sanger_actvity_scores` (sic) is kept because later code uses it.
sanger_actvity_scores = pd.merge(
    sanger_activity,
    scored_designs_stored,
    how='inner',
    on=activity_id_cols,
)
gecko_activity_scores = pd.merge(
    gecko_activity,
    scored_designs_stored,
    how='inner',
    on=activity_id_cols,
)
import seaborn as sns
# Measured-activity column and the score columns it is compared against.
activity_col = 'avg_mean_centered_neg_lfc'
score_cols = [
    'RS3 Sequence Score (Hsu2013 tracr)',
    'RS3 Sequence Score (Chen2013 tracr)',
    'Target Score',
    'RS3 Sequence (Hsu2013 tracr) + Target Score',
    'RS3 Sequence (Chen2013 tracr) + Target Score',
]

# Pairwise correlation matrix of activity and scores, drawn as a clustered
# heatmap with the colour scale pinned to [-1, 1].
sanger_activity_cors = sanger_actvity_scores[[activity_col] + score_cols].corr()
sns.clustermap(sanger_activity_cors, annot=True, cmap='RdBu_r', vmin=-1, vmax=1)

# Scores listed from the highest expected correlation with activity to the
# lowest; each entry must be strictly greater than the one after it.
sanger_expected_order = [
    'RS3 Sequence (Chen2013 tracr) + Target Score',
    'RS3 Sequence (Hsu2013 tracr) + Target Score',
    'RS3 Sequence Score (Chen2013 tracr)',
    'RS3 Sequence Score (Hsu2013 tracr)',
    'Target Score',
]
sanger_ranked_cors = [
    sanger_activity_cors.loc[activity_col, score_name]
    for score_name in sanger_expected_order
]
assert all(
    higher > lower
    for higher, lower in zip(sanger_ranked_cors, sanger_ranked_cors[1:])
)
# Pairwise correlation matrix of activity and scores for the GeCKO table,
# drawn as a clustered heatmap with the colour scale pinned to [-1, 1].
gecko_activity_cors = gecko_activity_scores[[activity_col] + score_cols].corr()
sns.clustermap(gecko_activity_cors, annot=True, cmap='RdBu_r', vmin=-1, vmax=1)

# Scores listed from the highest expected correlation with activity to the
# lowest; each entry must be strictly greater than the one after it.
# NOTE(review): this order puts the Chen2013 combined score below both
# sequence-only scores; presumably that reflects this dataset. Confirm.
gecko_expected_order = [
    'RS3 Sequence (Hsu2013 tracr) + Target Score',
    'RS3 Sequence Score (Hsu2013 tracr)',
    'RS3 Sequence Score (Chen2013 tracr)',
    'RS3 Sequence (Chen2013 tracr) + Target Score',
    'Target Score',
]
gecko_ranked_cors = [
    gecko_activity_cors.loc[activity_col, score_name]
    for score_name in gecko_expected_order
]
assert all(
    higher > lower
    for higher, lower in zip(gecko_ranked_cors, gecko_ranked_cors[1:])
)